Files
lab_tinylm/tlm/cli.py
2026-02-27 16:21:58 -05:00

89 lines
3.7 KiB
Python

import click
from .model import TinyLanguageModel
from .helpers import read_mail_text, join
from .tokenization import tokenize_text, tokenize_words
# Root command group; subcommands (e.g. `generate`) attach via @cli.command().
@click.group()
def cli():
    """TinyLM - A simple n-gram language model."""
    pass
@cli.command()
@click.option('-l', "--length", default=50, help="Number of tokens to generate.")
@click.option('-n', "--context-window-words", default=2, help="Number of words in the context window.")
@click.option('-f', "--filepath", type=click.Path(exists=True), multiple=True, help="Text file(s) to use as training corpus. Can be specified multiple times.")
@click.option('-g', "--gutenberg", multiple=True, help="NLTK Gutenberg corpus key(s). Can be specified multiple times.")
@click.option('-G', "--list-gutenberg", is_flag=True, help="List available Gutenberg corpus keys.")
@click.option('-m', "--mbox", type=click.Path(exists=True), help="Mbox file to use for training.")
@click.option('-p', "--prompt", help="Prompt to start generation.")
@click.option('-i', "--interact", is_flag=True, help="Drop into interactive shell after generating.")
@click.option('-t', "--tokenize", 'tokenize_opts', multiple=True, type=click.Choice(['lower', 'char', 'alpha']), help="Preprocessing option (can be specified multiple times). 'lower': lowercase all input text. 'char': use characters as tokens instead of words. 'alpha': keep alphabetic tokens only.")
@click.option('-v', "--verbose", is_flag=True, help="Display step-by-step generation as a table.")
def generate(length, context_window_words, filepath, gutenberg, list_gutenberg, mbox, prompt, interact, tokenize_opts, verbose):
    """Generate text using the language model.

    Builds a training corpus from any combination of plain-text files,
    NLTK Gutenberg corpora, and an mbox mailbox, trains a
    TinyLanguageModel on it, then prints `length` generated tokens.

    Raises:
        click.UsageError: if no training source was provided.
    """
    # nltk is imported lazily so the CLI starts fast when it isn't needed.
    import nltk

    # Handle --list-gutenberg: print the available corpus keys and exit
    # without requiring (or building) any training data.
    if list_gutenberg:
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        click.echo("Available Gutenberg corpus keys:")
        for key in gutenberg_corpus.fileids():
            click.echo(f" {key}")
        return

    # Assemble the training corpus; the source flags are additive, so
    # several of them can contribute tokens to a single corpus.
    corpus = []
    if filepath:
        for fp in filepath:
            # Explicit encoding avoids platform-dependent defaults
            # (e.g. cp1252 on Windows) when reading training text.
            with open(fp, "r", encoding="utf-8") as f:
                corpus.extend(tokenize_text(f.read(), tokenize_opts))
    if gutenberg:
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        for key in gutenberg:
            # Gutenberg text arrives pre-split into words, so it goes
            # through the word-list tokenizer rather than the text one.
            corpus.extend(tokenize_words(gutenberg_corpus.words(key), tokenize_opts))
    if mbox:
        mail_text = read_mail_text(mbox)
        corpus.extend(tokenize_text(mail_text, tokenize_opts))
    if not corpus:
        raise click.UsageError("No training data provided. Must specify at least one of --filepath, --gutenberg, or --mbox.")

    # Train the model; the prompt (if any) is tokenized with the same
    # options as the corpus so its tokens match the model's vocabulary.
    model = TinyLanguageModel(n=context_window_words)
    model.train(corpus)
    if prompt:
        prompt_tokens = tokenize_text(prompt, tokenize_opts)
    else:
        prompt_tokens = None

    # Character-level output is joined with no separator; word-level
    # display falls back to the project's default `join` helper.
    join_fn = ''.join if 'char' in tokenize_opts else None
    display_join = join_fn or join

    if verbose:
        # tabulate/textwrap are only needed for the step-by-step table.
        from tabulate import tabulate
        import textwrap
        rows = []

        def step_callback(pattern, options, chosen):
            # Record one generation step: the context window, the
            # de-duplicated candidate tokens (wrapped for readability),
            # and the token the model selected.
            opts = textwrap.fill(', '.join(sorted(set(options))), width=60)
            rows.append([display_join(list(pattern)), opts, chosen])

        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn, step_callback=step_callback)
        click.echo(tabulate(rows, headers=["Context", "Options", "Selected"], tablefmt="simple"))
        click.echo()
    else:
        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn)

    click.echo(output)

    if interact:
        import code
        # NOTE(review): locals() exposes every name in this frame, not
        # just 'model' and 'output' as the banner suggests.
        code.interact(local=locals(), banner="Entering interactive shell. 'model' and 'output' are available.")
# Allow running this module directly; normally `cli` is invoked via the
# installed console entry point.
if __name__ == "__main__":
    cli()