List links in HTML files

This Python script walks through a directory of HTML files and extracts all the links it finds. It outputs one line per link, with each line also listing the pages on which that link appears. Relative links, and links to domains on a small ignore list, are skipped.

import os
from os.path import join
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Links as keys, list of pages where the link appears as values.
links = {}


def get_links(filepath):
    with open(filepath, 'r') as fhandle:
        try:
            text = BeautifulSoup(fhandle, features="lxml")
        except UnicodeDecodeError:
            print()
            print("UnicodeDecodeError: " + filepath)
            return
        # Iterate through all the "a" tags.
        for link in text.find_all("a"):
            url = link.get('href')
            # Ignore relative links and certain domain names.
            if url and urlparse(url).netloc \
                    and urlparse(url).netloc not in ["example.com"]:
                # Append link to dictionary.
                if url not in links:
                    links[url] = []
                # Limit number of pages to associate with link.
                if len(links[url]) < 8:
                    links[url].append(filepath)


file_count = 0
fout = open("links.txt", "w")
os.chdir("../flattened")

# Iterate through all the files in the flattened site.
for root, dirs, files in os.walk('.'):
    for f in files:
        if f not in [".DS_Store"]:
            get_links(join(root, f))
            print(".", end="", flush=True)
            file_count += 1
            # Stop after some files:
            # if file_count > 100: break

print()

# Write file with links.
for link, pages in links.items():
    fout.write(link + ", " + str(pages) + "\n")
fout.close()
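
For reference, the core extraction step can be tried in isolation. This is a minimal sketch run on an inline HTML snippet (the markup and URLs are made up), showing what find_all("a") and get('href') return:

from bs4 import BeautifulSoup

html = '<p><a href="https://example.org/a.html">A</a> and <a href="notes.html">B</a></p>'
soup = BeautifulSoup(html, features="lxml")
# get('href') returns None for an "a" tag that has no href attribute.
print([a.get('href') for a in soup.find_all("a")])
# Prints: ['https://example.org/a.html', 'notes.html']

Note that both absolute and relative hrefs come back; separating them is the job of urlparse, shown next.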
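
The relative-link test works because urllib.parse.urlparse returns an empty netloc for URLs that carry no host name. A small sketch with made-up URLs:

from urllib.parse import urlparse

# Relative URLs parse with an empty netloc; absolute URLs carry the host.
for url in ["style.css", "/about.html", "https://example.org/post.html"]:
    netloc = urlparse(url).netloc
    print(url, "->", "kept" if netloc else "skipped (relative)")
# style.css -> skipped (relative)
# /about.html -> skipped (relative)
# https://example.org/post.html -> kept

The ["example.com"] list then drops absolute links to chosen domains; presumably the site's own domain goes there, so internal links written in absolute form are excluded as well.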
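
Each line of links.txt pairs a URL with the Python-list representation of up to eight pages that reference it. With hypothetical URLs and paths, the output looks something like:

https://example.org/post.html, ['./2014/01/intro.html', './2014/02/update.html']
https://example.org/tools.html, ['./2015/03/roundup.html']

The paths start with "./" because os.walk is rooted at the current directory.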