initial commit

2024-03-29 19:39:37 -04:00
commit ddf5786bdc
9 changed files with 954 additions and 0 deletions
--- a/poem_server/app/models.py
+++ b/poem_server/app/models.py
@@ -0,0 +1,51 @@
+from banjo.models import Model, StringField, IntegerField, ForeignKey
+from banjo.http import BadRequest
+import pronouncing
+import re
+
+class Line(Model):
+    "One line of a poem: its raw text, a cleaned copy, its position, its poem and its rhyme"
+    text = StringField()
+    clean_text = StringField()
+    line_number = IntegerField()
+    poem = ForeignKey("Poem", related_name="lines")
+    rhyme = ForeignKey("Rhyme", related_name="lines", null=True)
+
+    def __str__(self):
+        "The string representation of a line is just its text"
+        return self.text
+
+    def last_word(self):
+        "Gets the last whitespace-separated word of clean_text, or None if it has no words."
+        parts = self.clean_text.split()
+        if parts:
+            return parts[-1]
+        return None
+
+    def rhyming_lines(self):
+        "Lines sharing this rhyme whose clean_text doesn't end with this last word; [] if no rhyme"
+        if self.rhyme:
+            return self.rhyme.lines.exclude(clean_text__endswith=self.last_word())
+        else:
+            return []
+
+class Poem(Model):
+    gutenberg_id = IntegerField(unique=True)
+
+    def __str__(self):
+        "Renders the poem as the text of its lines, newline-separated, sorted by line_number"
+        return '\n'.join([ln.text for ln in self.lines.order_by('line_number')])
+
+class Rhyme(Model):
+    phones = StringField()
+
+    @classmethod
+    def get_rhyme_for_word(cls, word):
+        "Returns the stored rhyme for the rhyming part of word's first pronunciation, else BadRequest"
+        candidates = pronouncing.phones_for_word(word)
+        if not candidates:
+            raise BadRequest(f"Couldn't figure out how to pronounce {word}")
+        try:
+            return cls.objects.get(phones=pronouncing.rhyming_part(candidates[0]))
+        except cls.DoesNotExist:
+            raise BadRequest(f"Sorry, no lines rhyme with {word}") from None
--- a/poem_server/app/views.py
+++ b/poem_server/app/views.py
@@ -0,0 +1,8 @@
+from banjo.urls import route_get, route_post
+from banjo.http import BadRequest, NotFound
+from app.models import Line, Poem, Rhyme
+from random import choice, sample
+
+@route_get('lines/random', args={})
+def get_random_line(params):  # params is never read; the route declares no args
+    return dict(line=Line.objects.random().text)
--- a/poem_server/import_poems.py
+++ b/poem_server/import_poems.py
@@ -0,0 +1,81 @@
+import os
+import json
+import re
+from tqdm import tqdm
+from banjo.runner import setup_django
+import requests
+from pathlib import Path
+from collections import defaultdict
+from argparse import ArgumentParser
+import pronouncing
+import gzip
+
+corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
+corpus_file = Path("corpus.gz")  # local copy of the download, relative to the working directory
+corpus_length = 3085117  # progress-bar total when no limit is given; presumably the corpus line count
+
+def download_corpus():
+    "Downloads a few million lines of poetry to corpus_file, unless that file already exists"
+    if corpus_file.exists():
+        return
+    print("Downloading a file with three million lines of poetry...")
+    with requests.get(corpus_url, stream=True) as download:
+        download.raise_for_status()  # fail before the file is created, so an error page is never saved
+        with corpus_file.open("wb") as f:
+            for chunk in download.iter_content(chunk_size=1024):
+                f.write(chunk)
+    
+def read_corpus(limit=None):
+    "Yields one parsed JSON value per corpus line; a positive limit caps how many lines are read"
+    from itertools import islice
+    with gzip.open(corpus_file) as f:  # closed when the generator finishes or is discarded
+        for line in islice(f, limit if limit and limit > 0 else None):
+            yield json.loads(line.strip())
+
+def clean(line):
+    "Lowercases the line, removes every character except a-z and space, then trims both ends"
+    return re.compile('[^a-z ]').sub('', line.lower()).strip()
+
+def get_last_word(line):
+    """Returns the final word of a line, or None if the line has no words.
+    The line is run through clean() first, then split on whitespace."""
+    words = clean(line).split()
+    if not words:
+        return None
+    return words[-1]
+
+def import_poems(limit=None):
+    "Deletes any existing database.sqlite, then imports each corpus line that has a last word"
+    database_file = Path("database.sqlite")
+    if database_file.exists():
+        database_file.unlink()
+    setup_django()
+    from app.models import Line, Poem, Rhyme  # deferred: presumably needs setup_django() first
+    print("Importing lines into the app's database...")
+    line_counts = defaultdict(int)
+    poems, rhymes = {}, {}  # cache instances so each poem/rhyme is looked up only once
+    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
+        text, gid = row['s'], row['gid']
+        last_word = get_last_word(text)
+        if not last_word:
+            continue  # no letters a-z left after cleaning, so there is nothing to rhyme on
+        last_word_phones = pronouncing.phones_for_word(last_word)
+        rhyme = None  # stays None when no pronunciation is found for the last word
+        if last_word_phones:
+            rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
+            if rhyming_phones not in rhymes:
+                rhymes[rhyming_phones], _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
+            rhyme = rhymes[rhyming_phones]
+        if gid not in poems:
+            poems[gid], _ = Poem.objects.get_or_create(gutenberg_id=gid)
+        Line.objects.create(
+            text=text, clean_text=clean(text), line_number=line_counts[gid],
+            poem=poems[gid], rhyme=rhyme,
+        )
+        line_counts[gid] += 1  # line numbers are 0-based and count only the lines stored
+
+cli = ArgumentParser()
+cli.add_argument('-l', '--limit', type=int)  # optional cap passed through to import_poems
+options = cli.parse_args()
+download_corpus()  # returns without downloading when corpus.gz is already present
+import_poems(options.limit)