import gzip
import json
import os
import re
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path

import pronouncing
import requests
from banjo.runner import setup_django
from tqdm import tqdm

# Location of the gzipped, newline-delimited JSON poetry corpus.
corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
# Local path the corpus is downloaded to and read from.
corpus_file = Path("corpus.gz")
# Progress-bar total used when no --limit is given
# (presumably the number of lines in the corpus -- verify if the URL changes).
corpus_length = 3085117

# Matches every character that is not a lowercase ASCII letter or a space.
# Compiled once because clean() is called for every line of the corpus.
_NOT_LETTER_OR_SPACE = re.compile(r"[^a-z ]")


def download_corpus():
    """Download the gzipped corpus to ``corpus_file`` unless it already exists.

    The response is streamed to a temporary ``.part`` file and renamed into
    place only after the whole body has been written, so an interrupted
    download is never mistaken for a complete corpus on the next run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if corpus_file.exists():
        return

    print("Downloading a file with three million lines of poetry...")
    partial_file = corpus_file.with_name(corpus_file.name + ".part")
    with requests.get(corpus_url, stream=True) as download:
        # Without this check an HTML error page would be saved as the corpus.
        download.raise_for_status()
        with partial_file.open("wb") as f:
            for chunk in download.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    partial_file.replace(corpus_file)


def read_corpus(limit=None):
    """Yield one decoded JSON object per line of the gzipped corpus.

    Args:
        limit: maximum number of lines to yield. A falsy value (``None`` or
            ``0``) means "no limit".
    """
    # The context manager closes the gzip handle even if the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(corpus_file) as f:
        for i, line in enumerate(f):
            if limit and i == limit:
                break
            yield json.loads(line.strip())


def clean(line):
    """Return ``line`` lowercased, keeping only the letters a-z and spaces.

    Punctuation, digits, non-ASCII letters and whitespace other than the
    space character are removed; leading and trailing spaces are stripped.
    """
    return _NOT_LETTER_OR_SPACE.sub('', line.lower()).strip()


def get_last_word(line):
    """Get the last word from a line.

    Strips out punctuation (see ``clean``), then splits the line on
    whitespace and returns the final word.

    Returns:
        The last word, or ``None`` when the cleaned line contains no words.
    """
    parts = clean(line).split()
    if parts:
        return parts[-1]
    return None


def import_poems(limit=None):
    """Import corpus lines into the app's database.

    Deletes ``database.sqlite`` if present, calls ``setup_django()``, then
    creates one ``Line`` per corpus row that has a last word. Each ``Line``
    is attached to the ``Poem`` for the row's ``gid`` and, when the
    ``pronouncing`` package knows the last word, to the ``Rhyme`` for the
    rhyming part of its first pronunciation.

    Args:
        limit: maximum number of corpus rows to read; falsy means all rows.
    """
    database_file = Path("database.sqlite")
    if database_file.exists():
        database_file.unlink()
    setup_django()
    # Imported after setup_django(); presumably the models cannot be
    # imported until Django has been configured.
    from app.models import Line, Poem, Rhyme

    print("Importing lines into the app's database...")
    line_counts = defaultdict(int)
    # Memoise Poem/Rhyme instances so get_or_create runs once per distinct
    # key rather than once per corpus line.
    poems = {}
    rhymes = {}

    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
        text = row['s']
        gid = row['gid']

        last_word = get_last_word(text)
        if not last_word:
            # Skipped rows do not consume a line number.
            continue

        rhyme = None
        last_word_phones = pronouncing.phones_for_word(last_word)
        if last_word_phones:
            rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
            rhyme = rhymes.get(rhyming_phones)
            if rhyme is None:
                rhyme, _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
                rhymes[rhyming_phones] = rhyme

        poem = poems.get(gid)
        if poem is None:
            poem, _ = Poem.objects.get_or_create(gutenberg_id=gid)
            poems[gid] = poem

        Line.objects.create(
            text=text,
            clean_text=clean(text),
            line_number=line_counts[gid],
            poem=poem,
            rhyme=rhyme,
        )
        line_counts[gid] += 1


# NOTE(review): these statements run at import time. They are left unguarded
# to preserve existing behaviour; consider an ``if __name__ == "__main__":``
# guard if nothing relies on importing this module for its side effects.
parser = ArgumentParser()
parser.add_argument('-l', '--limit', type=int)
args = parser.parse_args()
download_corpus()
import_poems(args.limit)