Allow combined corpora
This commit is contained in:
15
tlm/cli.py
15
tlm/cli.py
@@ -32,24 +32,23 @@ def generate(length, n, text, gutenberg, list_gutenberg, mbox, prompt, interact)
|
||||
return
|
||||
|
||||
# Determine training corpus
|
||||
corpus = None
|
||||
corpus = []
|
||||
|
||||
if text:
|
||||
corpus = []
|
||||
for filepath in text:
|
||||
with open(filepath, "r") as f:
|
||||
corpus.extend(f.read().split())
|
||||
elif gutenberg:
|
||||
if gutenberg:
|
||||
nltk.download("gutenberg", quiet=True)
|
||||
from nltk.corpus import gutenberg as gutenberg_corpus
|
||||
corpus = []
|
||||
for key in gutenberg:
|
||||
corpus.extend(gutenberg_corpus.words(key))
|
||||
elif mbox:
|
||||
if mbox:
|
||||
mail_text = read_mail_text(mbox)
|
||||
corpus = mail_text.split()
|
||||
else:
|
||||
raise click.UsageError("Must specify one of --text, --gutenberg, or --mbox for training data.")
|
||||
corpus.extend(mail_text.split())
|
||||
|
||||
if not corpus:
|
||||
raise click.UsageError("Must specify at least one of --text, --gutenberg, or --mbox for training data.")
|
||||
|
||||
# Train and generate
|
||||
model = TinyLanguageModel(n=n)
|
||||
|
||||
Reference in New Issue
Block a user