The goal of efficiency is more slack.

Tuesday, September 17, 2019

Sites for Django

How to use the sites framework for Django.

Follow the instructions in the Django documentation to install the sites framework, then create a data migration in one of your apps.
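Installing it amounts to a settings change followed by migrate (a minimal sketch, assuming the conventional SITE_ID of 1):

INSTALLED_APPS = [
    # ...
    'django.contrib.sites',
]

SITE_ID = 1

The data migration that sets the site's name and domain can then look like this: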

from __future__ import unicode_literals

from django.db import migrations

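# Data migration: set the name and domain of the default Site,
# creating the record first if it does not exist yet.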
def set_site_name(apps, schema_editor):
    Site = apps.get_model('sites', 'site')
    try:
        site = Site.objects.get(id=1)
    except Site.DoesNotExist:
        site = Site()
    site.name = "Astrobiology"
    site.domain = "astrobiology.nasa.gov"
    site.save()

class Migration(migrations.Migration):

    dependencies = [
        ('sites', '0002_alter_domain_unique'),
    ]

    operations = [
        migrations.RunPython(set_site_name),
    ]


Enter the latest migration from the "sites" app as a dependency to ensure the database table for sites exists before this migration runs.
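If you are not sure which "sites" migration is the latest on your Django version, you can list them, for example:

./manage.py showmigrations sites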

On the development computer, run ./manage.py shell and change the domain of the site object to 127.0.0.1:8000.
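A minimal sketch of that shell session (assuming the default site still has id 1, as in the migration above):

from django.contrib.sites.models import Site

site = Site.objects.get(id=1)
site.domain = "127.0.0.1:8000"
site.save()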

Wednesday, September 11, 2019

List links in HTML files

This Python script walks through a directory of HTML files and extracts all the links. It outputs one line per link; each line also lists the pages in which the link appears.

import os
from os.path import join
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Links as keys, list of pages where the link appears as values.
links = {}

def get_links(filepath):
    with open(filepath, 'r') as fhandle:
        try:
            soup = BeautifulSoup(fhandle, features="lxml")
        except UnicodeDecodeError:
            print()
            print("UnicodeDecodeError: " + filepath)
            return
        # Iterate through all the "a" tags
        for link in soup.find_all("a"):
            url = link.get('href')
            # Ignore relative links and certain domain names.
            if url and urlparse(url).netloc \
                and urlparse(url).netloc not in ["example.com"]:
                # Append link to dictionary.
                if url not in links:
                    links[url] = []
                # Limit number of pages to associate with link.
                if len(links[url]) < 8:
                    links[url].append(filepath)

file_count = 0
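# Open the output file before changing directories, so links.txt is written
# where the script was started rather than inside ../flattened.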
fout = open("links.txt", "w")
os.chdir("../flattened")
# Iterate through all the files in the flattened site
for root, dirs, files in os.walk('.'):
    for f in files:
        if f not in [".DS_Store"]:
            get_links(join(root, f))
            print(".", end="", flush=True)
            file_count += 1
    # Stop after some files
    # if file_count > 100: break
print()

# Write file with links.
for link, pages in links.items():
    fout.write(link + ", " + str(pages) + "\n")
fout.close()
