Initial commit

2026-02-16 20:09:01 +00:00
commit d0cdebd258
10 changed files with 416 additions and 0 deletions
--- a/tlm/cli.py
+++ b/tlm/cli.py
@@ -0,0 +1,66 @@
+import click
+from .model import TinyLanguageModel
+from .helpers import read_mail_text
+
+
+@click.group()
+def cli():
+    """TinyLM - A simple n-gram language model."""
+    pass
+
+
+@cli.command()
+@click.option("--length", default=50, help="Number of words to generate.")
+@click.option("--n", default=2, help="Number of words in the context window.")
+@click.option("--text", type=click.Path(exists=True), multiple=True, help="Text file(s) to use as training corpus. Can be specified multiple times.")
+@click.option("--gutenberg", multiple=True, help="NLTK Gutenberg corpus key(s). Can be specified multiple times.")
+@click.option("--list-gutenberg", is_flag=True, help="List available Gutenberg corpus keys.")
+@click.option("--mbox", type=click.Path(exists=True), help="Mbox file to use for training.")
+@click.option("--prompt", help="Prompt to start generation.")
+@click.option("--interact", is_flag=True, help="Drop into interactive shell after generating.")
+def generate(length, n, text, gutenberg, list_gutenberg, mbox, prompt, interact):
+    """Generate text using the language model."""
+    import nltk
+
+    # Handle --list-gutenberg: list available keys
+    if list_gutenberg:
+        nltk.download("gutenberg", quiet=True)
+        from nltk.corpus import gutenberg as gutenberg_corpus
+        click.echo("Available Gutenberg corpus keys:")
+        for key in gutenberg_corpus.fileids():
+            click.echo(f"  {key}")
+        return
+
+    # Determine training corpus
+    corpus = []
+
+    if text:
+        for filepath in text:
+            with open(filepath, "r") as f:
+                corpus.extend(f.read().split())
+    if gutenberg:
+        nltk.download("gutenberg", quiet=True)
+        from nltk.corpus import gutenberg as gutenberg_corpus
+        for key in gutenberg:
+            corpus.extend(gutenberg_corpus.words(key))
+    if mbox:
+        mail_text = read_mail_text(mbox)
+        corpus.extend(mail_text.split())
+
+    if not corpus:
+        raise click.UsageError("Must specify at least one of --text, --gutenberg, or --mbox for training data.")
+
+    # Train and generate
+    model = TinyLanguageModel(n=n)
+    model.train(corpus)
+    prompt_words = prompt.split() if prompt else None
+    output = model.generate(length, prompt=prompt_words)
+    click.echo(output)
+
+    if interact:
+        import code
+        code.interact(local=locals(), banner="Entering interactive shell. 'model' and 'output' are available.")
+
+
+if __name__ == "__main__":
+    cli()