
Wednesday, September 11, 2019

List links in HTML files

This Python script walks through a directory of HTML files and extracts all the links it finds. It writes one line per link, with each line also listing the pages on which the link appears.
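The script parses the HTML with BeautifulSoup using the lxml parser; if they aren't installed already, pip install beautifulsoup4 lxml provides both.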

import os
from os.path import join
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Links as keys, list of pages where the link appears as values.
links = {}

def get_links(filepath):
    with open(filepath, 'r') as fhandle:
        try:
            soup = BeautifulSoup(fhandle, features="lxml")
        except UnicodeDecodeError:
            print()
            print("UnicodeDecodeError: " + filepath)
            return
        # Iterate through all the "a" tags.
        for link in soup.find_all("a"):
            url = link.get('href')
            if not url:
                continue
            # netloc is empty for relative links, so this also skips them.
            netloc = urlparse(url).netloc
            # Ignore relative links and certain domain names.
            if netloc and netloc not in ["example.com"]:
                # Record this page under the link's entry.
                if url not in links:
                    links[url] = []
                # Limit the number of pages associated with each link.
                if len(links[url]) < 8:
                    links[url].append(filepath)

file_count = 0
fout = open("links.txt", "w")
os.chdir("../flattened")
# Walk every file in the flattened site.
for root, dirs, files in os.walk('.'):
    for f in files:
        if f not in [".DS_Store"]:
            get_links(join(root, f))
            # Print a progress dot for each file processed.
            print(".", end="", flush=True)
            file_count += 1
    # Uncomment to stop early when testing:
    # if file_count > 100: break
print()

# Write one line per link, followed by the list of pages that contain it.
for link, pages in links.items():
    fout.write(link + ", " + str(pages) + "\n")
fout.close()
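The script assumes it is run from a directory that sits alongside the flattened/ directory of HTML files (note the os.chdir("../flattened") line). Each line of links.txt pairs a URL with the Python-list representation of the pages that reference it, so with hypothetical file names an output line would look something like:

https://example.org/article, ['./index.html', './posts/intro.html']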
