List links in HTML files

This Python script walks through a directory of HTML files and extracts all the links it finds. It writes one line per link, with each line also listing the pages on which the link appears (capped at eight pages per link).

import os
from os.path import join, getsize
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Links as keys, list of pages where the link appears as values.
links = {}
def get_links(filepath):
    with open(filepath, 'r') as fhandle:
        try:
            text = BeautifulSoup(fhandle, features="lxml")
        except UnicodeDecodeError:
            print()
            print("UnicodeDecodeError: " + filepath)
            return
    # Iterate through all the "a" tags.
    for link in text.find_all("a"):
        url = link.get('href')
        # Ignore relative links and certain domain names.
        if url and urlparse(url).netloc \
                and urlparse(url).netloc not in ["example.com"]:
            # Append link to dictionary.
            if url not in links:
                links[url] = []
            # Limit number of pages to associate with link.
            if len(links[url]) < 8:
                links[url].append(filepath)
file_count = 0
fout = open("links.txt", "w")
os.chdir("../flattened")
# Iterate through all the files in the flattened site
for root, dirs, files in os.walk('.'):
    for f in files:
        if f not in [".DS_Store"]:
            get_links(join(root, f))
            print(".", end="", flush=True)
            file_count += 1
            # Stop after some files.
            # if file_count > 100: break
print()
# Write file with links.
for link, pages in links.items():
    fout.write(link + ", " + str(pages) + "\n")
fout.close()
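The filter in get_links() relies on urlparse(): a relative link has an empty netloc, so it fails the check, and the netloc comparison then excludes the listed domains. Here is a standalone sketch of that behaviour (the URLs are just examples):

from urllib.parse import urlparse

# Relative links and mailto: links have an empty netloc, so the
# "url and urlparse(url).netloc" test filters them out; the exclusion
# list then drops links to example.com itself.
for url in ["https://example.org/page", "/about.html",
            "mailto:someone@example.com", "https://example.com/home"]:
    netloc = urlparse(url).netloc
    kept = bool(netloc) and netloc not in ["example.com"]
    print(url, "->", netloc or "(empty)", "kept" if kept else "skipped")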
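For illustration, a run over a small site might produce lines like these in links.txt (the URLs and file paths here are invented):

https://example.org/article, ['./index.html', './archive/post1.html']
https://example.net/tools, ['./notes.html']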