initial commit

This commit is contained in:
Chris Proctor
2026-03-09 12:28:21 -04:00
commit 039a467a9f
13 changed files with 841 additions and 0 deletions

0
tlm/__init__.py Normal file
View File

106
tlm/cli.py Normal file
View File

@@ -0,0 +1,106 @@
import click
from .model import TinyLanguageModel
from .helpers import read_mail_text, join
from .tokenization import tokenize_text, tokenize_words
@click.group()
def cli():
    """TinyLM - A language model with learned word embeddings."""
    # Root of the command-line interface; subcommands attach via @cli.command().
    # NOTE: the docstring above doubles as the `--help` text shown by click.
    pass
@cli.command()
@click.option('-n', "--context-window-words", default=None, type=int, help="Number of words in the context window. (default: 2)")
@click.option('-d', "--embedding-dim", default=None, type=int, help="Dimension of the word embeddings. (default: 32)")
@click.option('-e', "--epochs", default=5, show_default=True, help="Number of training epochs.")
@click.option('-r', "--learning-rate", default=0.05, show_default=True, help="Learning rate for gradient descent.")
@click.option('-f', "--filepath", type=click.Path(exists=True), multiple=True, help="Text file(s) to train on. Can be specified multiple times.")
@click.option('-g', "--gutenberg", multiple=True, help="NLTK Gutenberg corpus key(s). Can be specified multiple times.")
@click.option('-G', "--list-gutenberg", is_flag=True, help="List available Gutenberg corpus keys.")
@click.option('-m', "--mbox", type=click.Path(exists=True), help="Mbox file to train on.")
@click.option('-o', "--output", default="model.json", show_default=True, help="File path to save the trained model.")
@click.option('-R', "--resume", 'resume_path', type=click.Path(exists=True), default=None, help="Load a saved model and continue training it.")
@click.option('-t', "--tokenize", 'tokenize_opts', multiple=True, type=click.Choice(['lower', 'char', 'alpha']), help="Tokenization options (can be specified multiple times).")
def train(context_window_words, embedding_dim, epochs, learning_rate, filepath, gutenberg, list_gutenberg, mbox, output, resume_path, tokenize_opts):
    """Train a language model on a corpus and save it to a file."""
    # nltk is imported lazily inside the branches that need it, so training
    # from --filepath or --mbox works even when nltk is not installed.
    # (Previously `import nltk` ran unconditionally at function entry.)
    if list_gutenberg:
        import nltk
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        click.echo("Available Gutenberg corpus keys:")
        for key in gutenberg_corpus.fileids():
            click.echo(f"  {key}")
        return
    # Accumulate one flat token list from every requested source.
    corpus = []
    if filepath:
        for fp in filepath:
            with open(fp) as f:
                corpus.extend(tokenize_text(f.read(), tokenize_opts))
    if gutenberg:
        import nltk
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        for key in gutenberg:
            # Gutenberg corpora are pre-tokenized; use the word-sequence path.
            corpus.extend(tokenize_words(gutenberg_corpus.words(key), tokenize_opts))
    if mbox:
        corpus.extend(tokenize_text(read_mail_text(mbox), tokenize_opts))
    if not corpus:
        raise click.UsageError("No training data provided. Use --filepath, --gutenberg, or --mbox.")
    if resume_path:
        # Architecture (n, embedding_dim) comes from the saved model; silently
        # ignoring overrides would mislead the user, so reject them outright.
        if context_window_words is not None or embedding_dim is not None:
            raise click.UsageError("-n/--context-window-words and -d/--embedding-dim are ignored when resuming a saved model. Remove these options.")
        model = TinyLanguageModel()
        model.load(resume_path)
        model.train(corpus, epochs=epochs, lr=learning_rate, resume=True)
    else:
        model = TinyLanguageModel(n=context_window_words or 2, embedding_dim=embedding_dim or 32)
        model.train(corpus, epochs=epochs, lr=learning_rate)
    model.save(output)
    click.echo(f"Model saved to {output}")
@cli.command()
@click.option('-m', "--model", 'model_path', required=True, type=click.Path(exists=True), help="Trained model file to load.")
@click.option('-l', "--length", default=50, show_default=True, help="Number of tokens to generate.")
@click.option('-p', "--prompt", help="Prompt to start generation.")
@click.option('-v', "--verbose", is_flag=True, help="Display step-by-step generation as a table.")
@click.option('-i', "--interact", is_flag=True, help="Drop into interactive shell after generating.")
@click.option('-t', "--tokenize", 'tokenize_opts', multiple=True, type=click.Choice(['lower', 'char', 'alpha']), help="Tokenization options for the prompt.")
def generate(model_path, length, prompt, verbose, interact, tokenize_opts):
    """Generate text using a trained model."""
    model = TinyLanguageModel()
    model.load(model_path)
    # Character-level models concatenate tokens directly; word-level models
    # fall back to the punctuation-aware join helper for display.
    if 'char' in tokenize_opts:
        join_fn = ''.join
    else:
        join_fn = None
    display_join = join_fn if join_fn else join
    if prompt:
        prompt_tokens = tokenize_text(prompt, tokenize_opts)
    else:
        prompt_tokens = None
    if verbose:
        from tabulate import tabulate
        import textwrap
        table_rows = []
        # Collect one table row per generation step via the model's callback hook.
        def record_step(context, options, chosen):
            wrapped_opts = textwrap.fill(', '.join(options), width=60)
            table_rows.append([display_join(list(context)), wrapped_opts, chosen])
        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn, step_callback=record_step)
        click.echo(tabulate(table_rows, headers=["Context", "Options", "Selected"], tablefmt="simple"))
        click.echo()
    else:
        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn)
    click.echo(output)
    if interact:
        import code
        code.interact(local=locals(), banner="Entering interactive shell. 'model' and 'output' are available.")
# Entry point when this file is executed directly as a script.
if __name__ == "__main__":
    cli()

61
tlm/helpers.py Normal file
View File

@@ -0,0 +1,61 @@
import mailbox
import email
from email.policy import default
from tqdm import tqdm
import numpy as np
def rolling_window(iterable, n):
    """Passes a rolling window over the iterable, yielding each n-length tuple.
    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)
    """
    stream = iter(iterable)
    buf = []
    try:
        # Prime the buffer with the first n items; a short iterable means
        # no complete window exists, so we fall into the except and yield nothing.
        for _ in range(n):
            buf.append(next(stream))
        while True:
            yield tuple(buf)
            # Slide the window: drop the oldest item, pull in the next one.
            del buf[:1]
            buf.append(next(stream))
    except StopIteration:
        return
def read_mail_text(mbox_path):
    """
    Extract and concatenate all plaintext content from an mbox file.

    Returns a single string with each text/plain part's stripped content
    separated by blank lines. Parts that fail to decode are skipped
    (best-effort, as before).
    """
    texts = []
    mbox = mailbox.mbox(
        mbox_path,
        factory=lambda f: email.message_from_binary_file(f, policy=default)
    )
    for msg in tqdm(mbox):
        # Message.walk() yields the message itself when it is not multipart,
        # so one loop replaces the duplicated multipart / single-part branches.
        for part in msg.walk():
            if part.get_content_type() != "text/plain":
                continue
            try:
                text = part.get_content()
            except Exception:
                # Undecodable part (bad charset, broken MIME): skip silently.
                continue
            if text:
                texts.append(text.strip())
    return "\n\n".join(texts)
def join(tokens, punctuation=".,?!:;'\""):
    "Joins text, but does not give extra space before punctuation."
    # Prefix every non-punctuation token with a space, then trim the edges.
    spaced = (tok if tok in punctuation else f' {tok}' for tok in tokens)
    return ''.join(spaced).strip()
def softmax(x):
    "Convert a vector of scores (logits) into a probability distribution."
    # Subtracting the max is a standard numerical-stability trick:
    # softmax is invariant to shifting, and it prevents exp overflow.
    shifted = np.exp(x - np.max(x))
    return shifted / np.sum(shifted)

170
tlm/model.py Normal file
View File

@@ -0,0 +1,170 @@
import json
import random
import numpy as np
from .helpers import rolling_window, join, softmax
class TinyLanguageModel:
    """
    A language model that uses learned embeddings to predict the next word.

    Instead of counting how often each word follows each context (as in the
    matrix model), this model learns dense vector representations of words,
    called embeddings, and uses them to estimate the probability of the next word.

    The model has two learned matrices:
    E: (vocab_size x embedding_dim) -- one row per word in the vocabulary,
       representing that word as a dense vector
    W: (embedding_dim x vocab_size) -- maps an embedding to one score per word

    To predict the next word, the model:
    1. Looks up the embedding E[i] for each word i in the context window.
    2. Averages those embeddings to get a single context vector.
    3. Computes logits = context_vector @ W (one score per vocabulary word).
    4. Applies softmax to turn scores into probabilities.
    5. Samples the next word from those probabilities.

    Training works by gradient descent: the model repeatedly sees (context, target)
    pairs, makes a prediction, computes how wrong it was (cross-entropy loss),
    and nudges E and W in the direction that reduces the loss.
    """
    def __init__(self, n=2, embedding_dim=32):
        "Create a new model with context window of n words and given embedding size."
        self.n = n
        self.embedding_dim = embedding_dim
        self.vocab = None          # sorted list of unique words; set by train() or load()
        self.word_to_idx = None    # word -> row index into E
        self.E = None  # input embeddings: (vocab_size, embedding_dim)
        self.W = None  # output weights: (embedding_dim, vocab_size)
        self.b = None  # output bias: (vocab_size,)
    def train(self, words, epochs=5, lr=0.05, resume=False):
        """Learn word embeddings from a list of words using gradient descent.

        If resume=True, the existing vocabulary and learned matrices (E, W, b) are
        kept and training continues from where it left off. Words in the corpus that
        are not in the saved vocabulary are skipped with a warning.

        Raises ValueError if resume=True on an untrained/unloaded model, or if
        the corpus yields no usable (context, target) training windows.
        """
        if resume:
            if self.word_to_idx is None:
                # Previously this fell through to a confusing TypeError on the
                # membership test below.
                raise ValueError("Cannot resume training: no model has been trained or loaded.")
            oov = sorted({w for w in words if w not in self.word_to_idx})
            if oov:
                print(f"Warning: {len(oov)} word(s) not in saved vocabulary will be skipped: {oov[:10]}{'...' if len(oov) > 10 else ''}")
        else:
            self.vocab = sorted(set(words))
            self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
            self.initialize_matrices()
        # Each example pairs n context words with the word that follows them.
        # Windows containing out-of-vocabulary words are dropped.
        training_data = [
            (window[:-1], window[-1])
            for window in rolling_window(words, self.n + 1)
            if all(w in self.word_to_idx for w in window)
        ]
        if not training_data:
            # Previously an empty training set caused ZeroDivisionError in the
            # per-epoch loss report below.
            raise ValueError(f"No usable training windows: corpus needs at least {self.n + 1} consecutive in-vocabulary words.")
        from tqdm import tqdm
        for epoch in range(epochs):
            total_loss = 0.0
            # Reshuffle each epoch so consecutive updates are decorrelated.
            random.shuffle(training_data)
            for context, target in tqdm(training_data, desc=f"Epoch {epoch + 1}/{epochs}", leave=False):
                total_loss += self._step(context, target, lr)
            print(f"Epoch {epoch + 1}/{epochs} loss={total_loss / len(training_data):.4f}")
    def initialize_matrices(self):
        "Randomly initializes embedding, weight, and bias matrices"
        vocab_size = len(self.vocab)
        # Fixed seed keeps initialization reproducible across runs.
        rng = np.random.default_rng(42)
        # Small random values break symmetry without saturating softmax early.
        self.E = rng.standard_normal((vocab_size, self.embedding_dim)) * 0.01
        self.W = rng.standard_normal((self.embedding_dim, vocab_size)) * 0.01
        self.b = np.zeros(vocab_size)
    def _context_embedding(self, context):
        "Average the embeddings of the words in the context window."
        indices = [self.word_to_idx[w] for w in context]
        return self.E[indices].mean(axis=0)
    def _forward(self, context):
        "Return (context_emb, probs) for a given context tuple."
        ctx_emb = self._context_embedding(context)
        logits = ctx_emb @ self.W + self.b
        probs = softmax(logits)
        return ctx_emb, probs
    def _step(self, context, target, lr):
        "One gradient-descent step on a single (context, target) pair. Returns the loss."
        ctx_emb, probs = self._forward(context)
        target_idx = self.word_to_idx[target]
        # Cross-entropy loss; epsilon avoids log(0) for a vanishing probability.
        loss = -np.log(probs[target_idx] + 1e-12)
        # Gradient of cross-entropy loss w.r.t. logits: probs with 1 subtracted at target
        d_logits = probs.copy()
        d_logits[target_idx] -= 1.0
        # Gradients for output weights and bias
        d_W = ctx_emb[:, None] @ d_logits[None, :]  # (embedding_dim, vocab_size)
        d_b = d_logits  # (vocab_size,)
        # Gradient flows back through averaging to each context word's embedding.
        # NOTE: W must be read here (pre-update) for the backward pass.
        d_ctx_emb = self.W @ d_logits  # (embedding_dim,)
        d_per_word = d_ctx_emb / len(context)
        for idx in [self.word_to_idx[w] for w in context]:
            self.E[idx] -= lr * d_per_word
        self.W -= lr * d_W
        self.b -= lr * d_b
        return loss
    def generate(self, length, prompt=None, join_fn=None, step_callback=None):
        "Create new text using the learned embeddings."
        if self.E is None:
            raise Exception("The model has not been trained.")
        output = list(prompt or self.get_random_prompt())
        # Drop any prompt tokens not in vocabulary
        output = [w for w in output if w in self.word_to_idx]
        if len(output) < self.n:
            # Too few usable prompt words to fill a context window.
            output = list(self.get_random_prompt())
        while len(output) < length:
            context = tuple(output[-self.n:])
            _, probs = self._forward(context)
            # Renormalize: floating-point drift can make softmax output sum to
            # slightly != 1, which np.random.choice rejects with ValueError.
            probs = probs / probs.sum()
            chosen_idx = np.random.choice(len(self.vocab), p=probs)
            chosen_word = self.vocab[chosen_idx]
            if step_callback:
                # Report up to the ten likeliest candidates above 0.1% probability.
                top_indices = np.argsort(probs)[-10:][::-1]
                top_words = [self.vocab[i] for i in top_indices if probs[i] > 0.001]
                step_callback(context, top_words, chosen_word)
            output.append(chosen_word)
        return (join_fn or join)(output)
    def get_random_prompt(self):
        "Return a random context window drawn from the vocabulary."
        return tuple(random.choices(self.vocab, k=self.n))
    def save(self, filepath):
        "Save the model to a JSON file."
        model_data = {
            "n": self.n,
            "embedding_dim": self.embedding_dim,
            "vocab": self.vocab,
            "E": self.E.tolist(),
            "W": self.W.tolist(),
            "b": self.b.tolist(),
        }
        with open(filepath, "w") as f:
            json.dump(model_data, f)
    def load(self, filepath):
        "Load a model from a JSON file."
        with open(filepath) as f:
            data = json.load(f)
        self.n = data["n"]
        self.embedding_dim = data["embedding_dim"]
        self.vocab = data["vocab"]
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.E = np.array(data["E"])
        self.W = np.array(data["W"])
        self.b = np.array(data["b"])

31
tlm/tokenization.py Normal file
View File

@@ -0,0 +1,31 @@
def compress_whitespace(text):
    """Collapse sequences of whitespace into a single space."""
    import re
    # Replace each whitespace run with one space, then trim the ends.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def _is_alpha_token(token):
return all(c.isalpha() or c in " '" for c in token)
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options."""
    # 'lower' is applied to the whole string before splitting on whitespace.
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [tok for tok in tokens if _is_alpha_token(tok)]
    if 'char' not in options:
        return tokens
    # Character-level mode: rejoin, normalize spacing, and split into characters.
    return list(compress_whitespace(' '.join(tokens)))
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence."""
    # Materialize (and optionally lowercase) the incoming sequence.
    result = [w.lower() for w in words] if 'lower' in options else list(words)
    if 'alpha' in options:
        result = [w for w in result if _is_alpha_token(w)]
    if 'char' in options:
        # Character-level mode: rejoin, normalize spacing, split into characters.
        return list(compress_whitespace(' '.join(result)))
    return result