# lab_server/poem_server/import_poems.py

import os
import json
import re
from tqdm import tqdm
from banjo.runner import setup_django
import requests
from pathlib import Path
from collections import defaultdict
from argparse import ArgumentParser
import pronouncing
import gzip

# Location of the gzipped, newline-delimited JSON poetry corpus.
corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
# Where the downloaded corpus is kept, relative to the working directory.
corpus_file = Path("corpus.gz")
# Number of lines in the full corpus; used as the progress-bar total when
# no limit is given.
corpus_length = 3_085_117

def download_corpus():
    """Download the poetry corpus to ``corpus_file`` unless it already exists.

    The response is streamed into a temporary ``.part`` file which is renamed
    to ``corpus_file`` only after the whole download succeeded, so a failed or
    interrupted download never leaves behind a truncated corpus that later
    runs would mistake for a complete one.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    if corpus_file.exists():
        return
    print("Downloading a file with three million lines of poetry...")
    partial_file = corpus_file.with_name(corpus_file.name + ".part")
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(corpus_url, stream=True, timeout=30) as download:
            # Fail loudly instead of saving an HTTP error page as the corpus.
            download.raise_for_status()
            with partial_file.open("wb") as f:
                for chunk in download.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # Also covers Ctrl-C: drop the incomplete file, then re-raise.
        if partial_file.exists():
            partial_file.unlink()
        raise
    partial_file.replace(corpus_file)

def read_corpus(limit=None):
    """Yield the corpus records one at a time.

    Each line of the gzipped ``corpus_file`` is parsed as a JSON document and
    yielded in file order.

    Args:
        limit: Maximum number of lines to read. ``None`` or ``0`` reads the
            whole file.

    Yields:
        The decoded JSON value of each line.
    """
    # The with-block closes the gzip handle when the file is exhausted, the
    # limit is reached, or the generator is closed early.
    with gzip.open(corpus_file) as corpus:
        for i, line in enumerate(corpus):
            if limit and i == limit:
                break
            yield json.loads(line.strip())

def clean(line):
    """Return the lowercased line reduced to ASCII letters and spaces.

    Every character other than ``a``-``z`` and the space is dropped after
    lowercasing, and leading/trailing spaces are stripped.
    """
    lowered = line.lower()
    letters_and_spaces = re.sub('[^a-z ]', '', lowered)
    return letters_and_spaces.strip()

def get_last_word(line):
    """Return the final word of a line, or None when it has no words.

    The line is first normalised with ``clean`` and then split on whitespace.
    """
    words = clean(line).split()
    return words[-1] if words else None

def import_poems(limit=None):
    """Rebuild the app's database from the poetry corpus.

    Deletes any existing ``database.sqlite`` in the working directory, sets
    up Django, and then creates one ``Line`` per corpus row that has a last
    word. Each line is attached to the ``Poem`` for its Gutenberg id and, when
    the last word has a known pronunciation, to the ``Rhyme`` for that
    word's rhyming part.

    Args:
        limit: Maximum number of corpus lines to read. ``None`` or ``0``
            reads the whole corpus.
    """
    database_file = Path("database.sqlite")
    if database_file.exists():
        database_file.unlink()
    setup_django()
    # Imported here, after setup_django(), rather than at module top
    # (presumably the models cannot be loaded before Django is configured).
    from app.models import Line, Poem, Rhyme
    print("Importing lines into the app's database...")
    line_counts = defaultdict(int)
    # Each distinct poem and rhyme is fetched or created once and then reused,
    # instead of running a get_or_create query for every single line.
    poems = {}
    rhymes = {}
    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
        text = row['s']
        gid = row['gid']
        last_word = get_last_word(text)
        if not last_word:
            # Lines without any word are skipped and not numbered.
            continue
        last_word_phones = pronouncing.phones_for_word(last_word)
        if last_word_phones:
            # Only the first listed pronunciation is used.
            rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
            rhyme = rhymes.get(rhyming_phones)
            if rhyme is None:
                rhyme, _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
                rhymes[rhyming_phones] = rhyme
        else:
            rhyme = None
        poem = poems.get(gid)
        if poem is None:
            poem, _ = Poem.objects.get_or_create(gutenberg_id=gid)
            poems[gid] = poem
        Line.objects.create(
            text=text,
            clean_text=clean(text),
            line_number=line_counts[gid],
            poem=poem,
            rhyme=rhyme,
        )
        line_counts[gid] += 1

# Script entry point. These statements run whenever the module is executed:
# parse the command line, fetch the corpus if needed, then import it.
parser = ArgumentParser(
    description="Download the poetry corpus and import it into the app's database.",
)
parser.add_argument(
    '-l', '--limit', type=int,
    help="only read the first LIMIT lines of the corpus "
         "(default, or 0: read the whole corpus)",
)
args = parser.parse_args()
download_corpus()
import_poems(args.limit)