generated from mwc/lab_server
82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
import os
|
|
import json
|
|
import re
|
|
from tqdm import tqdm
|
|
from banjo.runner import setup_django
|
|
import requests
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from argparse import ArgumentParser
|
|
import pronouncing
|
|
import gzip
|
|
|
|
# Location of the gzipped, newline-delimited JSON poetry corpus.
corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"

# Local path where the downloaded corpus is stored.
corpus_file = Path("corpus.gz")

# Used as the progress-bar total when no limit is given
# (presumably the number of records in the corpus -- confirm against the data).
corpus_length = 3_085_117
|
|
|
|
def download_corpus():
    """Download the poetry corpus to ``corpus_file`` unless it already exists.

    The response is streamed into a temporary ``<name>.part`` file which is
    renamed to ``corpus_file`` only after the whole body has been written.
    An interrupted or failed download therefore never leaves behind a
    ``corpus_file`` that later runs would mistake for a complete corpus; a
    leftover ``.part`` file is simply overwritten by the next attempt.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if corpus_file.exists():
        return

    print("Downloading a file with three million lines of poetry...")
    partial_file = corpus_file.with_name(corpus_file.name + ".part")

    # The timeout bounds connecting and each wait for data, not the whole
    # transfer, so a large but healthy download is unaffected.
    with requests.get(corpus_url, stream=True, timeout=30) as download:
        # Fail loudly rather than saving an HTML error page as the corpus.
        download.raise_for_status()
        with partial_file.open("wb") as f:
            for chunk in download.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)

    # Publish the file only once it is complete.
    partial_file.replace(corpus_file)
|
|
|
|
def read_corpus(limit=None):
    """Yield the corpus one decoded JSON record at a time.

    Args:
        limit: Stop after this many records. A falsy value (``None`` or
            ``0``) means "no limit", so the whole corpus is read.

    Yields:
        The object parsed from each line of the gzipped ``corpus_file``.
    """
    # The context manager closes the gzip handle when the corpus is
    # exhausted, when the limit is reached, or when the generator is closed.
    with gzip.open(corpus_file) as lines:
        for i, line in enumerate(lines):
            if limit and i == limit:
                break
            yield json.loads(line.strip())
|
|
|
|
def clean(line):
    """Return *line* lowercased, keeping only the letters a-z and spaces.

    Every other character (punctuation, digits, tabs, newlines, non-ASCII
    letters) is removed, then leading and trailing spaces are trimmed.
    """
    lowered = line.lower()
    letters_and_spaces = re.sub('[^a-z ]', '', lowered)
    return letters_and_spaces.strip()
|
|
|
|
def get_last_word(line):
    """Return the final word of *line*, or ``None`` if it has no words.

    The line is first normalised with ``clean`` and then split on
    whitespace; the last resulting piece is the answer.
    """
    words = clean(line).split()
    return words[-1] if words else None
|
|
|
|
def import_poems(limit=None):
    """Rebuild the app's database from the poetry corpus.

    Deletes any existing ``database.sqlite``, calls ``setup_django()``, and
    then creates one ``Line`` per corpus row. Each line is attached to the
    ``Poem`` for the row's ``gid`` and, when ``pronouncing`` returns phones
    for the line's last word, to the ``Rhyme`` for that word's rhyming part
    (otherwise its rhyme is ``None``).

    Rows whose text contains no word after cleaning are skipped and do not
    consume a line number.

    Args:
        limit: Maximum number of corpus rows to read; a falsy value reads
            the whole corpus.
    """
    database_file = Path("database.sqlite")
    if database_file.exists():
        database_file.unlink()
    setup_django()
    # Models can only be imported once Django has been set up.
    from app.models import Line, Poem, Rhyme

    print("Importing lines into the app's database...")
    line_counts = defaultdict(int)
    # In-memory caches so each poem and each distinct last word costs one
    # lookup in total instead of one database query per corpus row.
    poems_by_gid = {}
    rhymes_by_word = {}

    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
        text = row['s']
        gid = row['gid']
        last_word = get_last_word(text)
        if not last_word:
            continue

        if last_word not in rhymes_by_word:
            last_word_phones = pronouncing.phones_for_word(last_word)
            if last_word_phones:
                # Only the first returned pronunciation is used.
                rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
                rhyme, _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
            else:
                rhyme = None
            rhymes_by_word[last_word] = rhyme
        rhyme = rhymes_by_word[last_word]

        if gid not in poems_by_gid:
            poems_by_gid[gid], _ = Poem.objects.get_or_create(gutenberg_id=gid)
        poem = poems_by_gid[gid]

        Line.objects.create(
            text=text,
            clean_text=clean(text),
            line_number=line_counts[gid],
            poem=poem,
            rhyme=rhyme,
        )
        line_counts[gid] += 1
|
|
|
|
# Script entry point: parse the optional row limit, make sure the corpus is
# on disk, then import it into the app's database.
# NOTE(review): these statements run on import as well as on execution;
# consider an ``if __name__ == "__main__":`` guard if this module is ever
# imported from elsewhere.
parser = ArgumentParser()
parser.add_argument("-l", "--limit", type=int)
args = parser.parse_args()

download_corpus()
import_poems(args.limit)
|