initial commit
This commit is contained in:
51
poem_server/app/models.py
Normal file
51
poem_server/app/models.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from banjo.models import Model, StringField, IntegerField, ForeignKey
|
||||
from banjo.http import BadRequest
|
||||
import pronouncing
|
||||
import re
|
||||
|
||||
class Line(Model):
    """A single line of poetry that belongs to a poem.

    Fields:
        text: the line exactly as imported.
        clean_text: a normalized copy of ``text`` used for word lookups
            (the import script stores lowercase letters and spaces only).
        line_number: position of the line within its poem.
        poem: the owning ``Poem``; reachable from the poem as ``poem.lines``.
        rhyme: the ``Rhyme`` group for the line's last word, or ``None``
            when no rhyme group was assigned.
    """
    text = StringField()
    clean_text = StringField()
    line_number = IntegerField()
    poem = ForeignKey("Poem", related_name="lines")
    rhyme = ForeignKey("Rhyme", related_name="lines", null=True)

    def __str__(self):
        """The string representation of a line is just its text."""
        return self.text

    def last_word(self):
        """Get the last word of the line from ``clean_text``.

        Returns:
            The final whitespace-separated word of ``clean_text``, or
            ``None`` when ``clean_text`` contains no words.
        """
        parts = self.clean_text.split()
        if parts:
            return parts[-1]
        return None

    def rhyming_lines(self):
        """Find other lines that share this line's rhyme group.

        Lines whose ``clean_text`` ends with this line's last word are
        excluded, so a line is never offered as a rhyme for itself or for
        another line ending in the same word.

        Returns:
            A queryset of ``Line`` objects when this line has a rhyme group,
            otherwise an empty list.
        """
        if self.rhyme:
            return self.rhyme.lines.exclude(clean_text__endswith=self.last_word())
        else:
            return []
|
||||
|
||||
class Poem(Model):
    """A poem, identified by its unique ``gutenberg_id``.

    The poem's lines are reached through the ``lines`` reverse relation
    declared on ``Line.poem``.
    """
    gutenberg_id = IntegerField(unique=True)

    def __str__(self):
        """Render the poem as its line texts, one per row, in line order."""
        ordered_lines = self.lines.order_by('line_number')
        texts = [entry.text for entry in ordered_lines]
        return '\n'.join(texts)
|
||||
|
||||
class Rhyme(Model):
    """A group of lines whose last words share the same rhyming sounds.

    ``phones`` holds the rhyming part of a pronunciation as returned by
    ``pronouncing.rhyming_part``; member lines are reached through the
    ``lines`` reverse relation declared on ``Line.rhyme``.
    """
    phones = StringField()

    @classmethod
    def get_rhyme_for_word(cls, word):
        """Look up the stored rhyme group that matches ``word``.

        Only the first pronunciation reported for ``word`` is considered.

        Args:
            word: the word to find a rhyme group for.

        Returns:
            The stored rhyme object whose ``phones`` equals the rhyming part
            of the word's first pronunciation.

        Raises:
            BadRequest: if ``pronouncing`` knows no pronunciation for
                ``word``, or if no rhyme group with matching phones exists.
        """
        phones = pronouncing.phones_for_word(word)
        if not phones:
            raise BadRequest(f"Couldn't figure out how to pronounce {word}")
        rhyming_phones = pronouncing.rhyming_part(phones[0])
        try:
            return cls.objects.get(phones=rhyming_phones)
        except cls.DoesNotExist:
            # The missing-row error is an internal detail; report only the
            # client-facing problem instead of chaining the lookup failure.
            raise BadRequest(f"Sorry, no lines rhyme with {word}") from None
|
8
poem_server/app/views.py
Normal file
8
poem_server/app/views.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from banjo.urls import route_get, route_post
|
||||
from banjo.http import BadRequest, NotFound
|
||||
from app.models import Line, Poem, Rhyme
|
||||
from random import choice, sample
|
||||
|
||||
@route_get('lines/random', args={})
def get_random_line(params):
    """Respond with the text of one randomly chosen line.

    ``params`` is accepted but not used; the route declares no arguments.
    The random pick is delegated to ``Line.objects.random()``
    (presumably a helper provided by banjo's model manager -- confirm).
    """
    picked = Line.objects.random()
    return {'line': picked.text}
|
81
poem_server/import_poems.py
Normal file
81
poem_server/import_poems.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
from banjo.runner import setup_django
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from argparse import ArgumentParser
|
||||
import pronouncing
|
||||
import gzip
|
||||
|
||||
# Where the gzipped, newline-delimited JSON poetry corpus is fetched from.
corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
# Local path of the downloaded corpus.
corpus_file = Path("corpus.gz")
# Total used for the progress bar when no import limit is given.
corpus_length = 3_085_117
|
||||
|
||||
def download_corpus():
    """Download the poetry corpus to ``corpus_file`` unless it already exists.

    The response is streamed into a sibling ``.part`` file that is renamed
    onto ``corpus_file`` only after the whole body has been written, so an
    interrupted download is never mistaken for a complete corpus on the next
    run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if corpus_file.exists():
        return
    print("Downloading a file with three million lines of poetry...")
    partial_file = corpus_file.with_name(corpus_file.name + ".part")
    # The context manager releases the connection even if writing fails.
    with requests.get(corpus_url, stream=True) as download:
        # Without this check an HTML error page would be saved as the corpus.
        download.raise_for_status()
        with partial_file.open("wb") as f:
            for chunk in download.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    partial_file.replace(corpus_file)
|
||||
|
||||
def read_corpus(limit=None):
    """Yield one decoded JSON record per line of the gzipped corpus.

    Args:
        limit: maximum number of records to yield. ``None`` or ``0`` means
            read the whole corpus.

    Yields:
        The object parsed from each line of ``corpus_file``.
    """
    # Opening inside ``with`` closes the file once the generator finishes
    # or is closed early.
    with gzip.open(corpus_file) as corpus:
        for i, line in enumerate(corpus):
            if limit and i == limit:
                break
            yield json.loads(line.strip())
|
||||
|
||||
def clean(line):
    """Normalize a line to lowercase letters and spaces.

    The line is lowercased, every character other than ``a``-``z`` or a
    space is removed, and leading/trailing whitespace is trimmed.
    """
    lowered = line.lower()
    letters_and_spaces = re.sub('[^a-z ]', '', lowered)
    return letters_and_spaces.strip()
|
||||
|
||||
def get_last_word(line):
    """Return the final word of a line, or ``None`` if it has no words.

    The line is first normalized with ``clean`` and then split on
    whitespace; the last resulting piece is the answer.
    """
    words = clean(line).split()
    return words[-1] if words else None
|
||||
|
||||
def import_poems(limit=None):
    """Rebuild the app database from the downloaded corpus.

    Any existing ``database.sqlite`` is deleted first, so every run starts
    from an empty database. Each corpus record supplies the line text
    (``row['s']``) and the Gutenberg id of its poem (``row['gid']``). Lines
    with no last word after cleaning are skipped; lines whose last word has
    no known pronunciation are stored without a rhyme group.

    Args:
        limit: maximum number of corpus records to read; ``None`` reads the
            whole corpus.
    """
    database_file = Path("database.sqlite")
    if database_file.exists():
        database_file.unlink()
    # Models can only be imported after Django has been configured.
    setup_django()
    from app.models import Line, Poem, Rhyme
    print("Importing lines into the app's database...")
    line_counts = defaultdict(int)
    # Poems and rhyme groups repeat across many lines; remembering the
    # objects already fetched avoids two database lookups per line.
    rhymes_by_phones = {}
    poems_by_gid = {}
    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
        text = row['s']
        last_word = get_last_word(text)
        if not last_word:
            continue
        last_word_phones = pronouncing.phones_for_word(last_word)
        if last_word_phones:
            # Only the first listed pronunciation decides the rhyme group.
            rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
            rhyme = rhymes_by_phones.get(rhyming_phones)
            if rhyme is None:
                rhyme, _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
                rhymes_by_phones[rhyming_phones] = rhyme
        else:
            rhyme = None
        gid = row['gid']
        poem = poems_by_gid.get(gid)
        if poem is None:
            poem, _ = Poem.objects.get_or_create(gutenberg_id=gid)
            poems_by_gid[gid] = poem
        Line.objects.create(
            text=text,
            clean_text=clean(text),
            line_number=line_counts[gid],
            poem=poem,
            rhyme=rhyme,
        )
        # Numbering counts only the lines actually stored for each poem.
        line_counts[gid] += 1
|
||||
|
||||
# Run the download and import only when executed as a script, so that merely
# importing this module never parses sys.argv, hits the network, or deletes
# the existing database.
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-l', '--limit', type=int,
                        help="only read this many corpus lines")
    args = parser.parse_args()
    download_corpus()
    import_poems(args.limit)
|
Reference in New Issue
Block a user