From f372786dbccdb047be6910b9bd59cc523206e451 Mon Sep 17 00:00:00 2001 From: chris Date: Mon, 9 Feb 2026 12:35:23 -0500 Subject: [PATCH] Allow combined corpora --- tlm/cli.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tlm/cli.py b/tlm/cli.py index c929050..d1ff2d5 100644 --- a/tlm/cli.py +++ b/tlm/cli.py @@ -32,24 +32,23 @@ def generate(length, n, text, gutenberg, list_gutenberg, mbox, prompt, interact) return # Determine training corpus - corpus = None + corpus = [] if text: - corpus = [] for filepath in text: with open(filepath, "r") as f: corpus.extend(f.read().split()) - elif gutenberg: + if gutenberg: nltk.download("gutenberg", quiet=True) from nltk.corpus import gutenberg as gutenberg_corpus - corpus = [] for key in gutenberg: corpus.extend(gutenberg_corpus.words(key)) - elif mbox: + if mbox: mail_text = read_mail_text(mbox) - corpus = mail_text.split() - else: - raise click.UsageError("Must specify one of --text, --gutenberg, or --mbox for training data.") + corpus.extend(mail_text.split()) + + if not corpus: + raise click.UsageError("Must specify at least one of --text, --gutenberg, or --mbox for training data.") # Train and generate model = TinyLanguageModel(n=n)