generated from mwc/lab_server
	
		
			
				
	
	
		
			82 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			82 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import os
 | 
						|
import json
 | 
						|
import re
 | 
						|
from tqdm import tqdm
 | 
						|
from banjo.runner import setup_django
 | 
						|
import requests
 | 
						|
from pathlib import Path
 | 
						|
from collections import defaultdict
 | 
						|
from argparse import ArgumentParser
 | 
						|
import pronouncing
 | 
						|
import gzip
 | 
						|
 | 
						|
# Location of the gzipped, newline-delimited JSON poetry corpus fetched by download_corpus().
corpus_url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
 | 
						|
# Local path (relative to the working directory) where the corpus is saved and later read from.
corpus_file = Path("corpus.gz")
 | 
						|
# Used as the progress-bar total in import_poems() when no limit is given;
# presumably the number of lines in the full corpus -- confirm against the file.
corpus_length = 3085117
 | 
						|
 | 
						|
def download_corpus():
    """Download the gzipped poetry corpus to ``corpus_file`` if it is missing.

    Does nothing when ``corpus_file`` already exists. Otherwise the body of
    ``corpus_url`` is streamed into a temporary ``.part`` file next to it and
    renamed into place only after the whole response has been written, so an
    interrupted download is never mistaken for a complete corpus on a later
    run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if corpus_file.exists():
        return
    print("Downloading a file with three million lines of poetry...")
    partial_file = corpus_file.with_name(corpus_file.name + ".part")
    # The context manager releases the connection even if writing fails.
    with requests.get(corpus_url, stream=True) as download:
        # Fail loudly instead of saving an HTTP error page as the corpus.
        download.raise_for_status()
        with partial_file.open("wb") as f:
            for chunk in download.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    # Publish the file only once it is complete.
    partial_file.replace(corpus_file)
 | 
						|
    
 | 
						|
def read_corpus(limit=None):
    """Yield the corpus one parsed JSON record at a time.

    Args:
        limit: Maximum number of records to yield. A falsy value (``None``
            or ``0``) reads the whole file.

    Yields:
        The object decoded by ``json.loads`` from each line of
        ``corpus_file``.
    """
    # Open through a context manager so the gzip handle is closed when the
    # generator is exhausted or stopped early by ``limit``.
    with gzip.open(corpus_file) as lines:
        for i, line in enumerate(lines):
            if limit and i == limit:
                break
            yield json.loads(line.strip())
 | 
						|
 | 
						|
def clean(line):
    """Return the line lowercased, keeping only the letters a-z and spaces.

    Every other character (punctuation, digits, accented letters, tabs,
    newlines) is dropped, and leading/trailing spaces are trimmed.
    """
    kept = [ch for ch in line.lower() if ch == " " or "a" <= ch <= "z"]
    return "".join(kept).strip()
 | 
						|
 | 
						|
def get_last_word(line):
    """Return the final word of a line, or None if the line has no words.

    The line is passed through clean() first, then split on whitespace; the
    last resulting piece is the answer.
    """
    words = clean(line).split()
    return words[-1] if words else None
 | 
						|
 | 
						|
def import_poems(limit=None):
    """Rebuild the app's database from the poetry corpus.

    Deletes any existing ``database.sqlite``, calls ``setup_django()``, and
    then stores one ``Line`` per corpus record. Each line is linked to its
    ``Poem`` (keyed by the record's ``gid``) and, when the last word has a
    known pronunciation, to a ``Rhyme`` keyed by the rhyming part of that
    word's first pronunciation. Records with no words left after cleaning
    are skipped and do not consume a line number.

    Args:
        limit: Maximum number of corpus records to read; a falsy value
            (``None`` or ``0``) reads the whole corpus.
    """
    database_file = Path("database.sqlite")
    if database_file.exists():
        database_file.unlink()
    setup_django()
    # Imported here, after setup_django(), rather than at module top --
    # presumably the models need Django configured first.
    from app.models import Line, Poem, Rhyme
    print("Importing lines into the app's database...")
    line_counts = defaultdict(int)
    # The same poem and rhyme recur on many lines; remember the rows already
    # fetched so each costs one get_or_create query instead of one per line.
    poems = {}
    rhymes = {}
    for row in tqdm(read_corpus(limit=limit), total=limit or corpus_length):
        text = row['s']
        gid = row['gid']
        last_word = get_last_word(text)
        if not last_word:
            continue
        last_word_phones = pronouncing.phones_for_word(last_word)
        if last_word_phones:
            rhyming_phones = pronouncing.rhyming_part(last_word_phones[0])
            rhyme = rhymes.get(rhyming_phones)
            if rhyme is None:
                rhyme, _ = Rhyme.objects.get_or_create(phones=rhyming_phones)
                rhymes[rhyming_phones] = rhyme
        else:
            rhyme = None
        poem = poems.get(gid)
        if poem is None:
            poem, _ = Poem.objects.get_or_create(gutenberg_id=gid)
            poems[gid] = poem
        Line.objects.create(
            text=text,
            clean_text=clean(text),
            line_number=line_counts[gid],
            poem=poem,
            rhyme=rhyme,
        )
        line_counts[gid] += 1
 | 
						|
 | 
						|
# Script entry point: read the optional -l/--limit argument, make sure the
# corpus is on disk, then load it into the app's database.
parser = ArgumentParser()
parser.add_argument('-l', '--limit', type=int)
args = parser.parse_args()
download_corpus()
import_poems(limit=args.limit)
 |