initial commit
This commit is contained in:
0
tlm/__init__.py
Normal file
0
tlm/__init__.py
Normal file
106
tlm/cli.py
Normal file
106
tlm/cli.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import click
|
||||
from .model import TinyLanguageModel
|
||||
from .helpers import read_mail_text, join
|
||||
from .tokenization import tokenize_text, tokenize_words
|
||||
|
||||
|
||||
@click.group()
def cli():
    """TinyLM - A language model with learned word embeddings."""
    # Group-level entry point; all work happens in the subcommands.
|
||||
|
||||
|
||||
@cli.command()
@click.option('-n', "--context-window-words", default=None, type=int, help="Number of words in the context window. (default: 2)")
@click.option('-d', "--embedding-dim", default=None, type=int, help="Dimension of the word embeddings. (default: 32)")
@click.option('-e', "--epochs", default=5, show_default=True, help="Number of training epochs.")
@click.option('-r', "--learning-rate", default=0.05, show_default=True, help="Learning rate for gradient descent.")
@click.option('-f', "--filepath", type=click.Path(exists=True), multiple=True, help="Text file(s) to train on. Can be specified multiple times.")
@click.option('-g', "--gutenberg", multiple=True, help="NLTK Gutenberg corpus key(s). Can be specified multiple times.")
@click.option('-G', "--list-gutenberg", is_flag=True, help="List available Gutenberg corpus keys.")
@click.option('-m', "--mbox", type=click.Path(exists=True), help="Mbox file to train on.")
@click.option('-o', "--output", default="model.json", show_default=True, help="File path to save the trained model.")
@click.option('-R', "--resume", 'resume_path', type=click.Path(exists=True), default=None, help="Load a saved model and continue training it.")
@click.option('-t', "--tokenize", 'tokenize_opts', multiple=True, type=click.Choice(['lower', 'char', 'alpha']), help="Tokenization options (can be specified multiple times).")
def train(context_window_words, embedding_dim, epochs, learning_rate, filepath, gutenberg, list_gutenberg, mbox, output, resume_path, tokenize_opts):
    """Train a language model on a corpus and save it to a file.

    Corpus sources (-f/-g/-m) may be combined; their tokens are concatenated.
    With -R/--resume the saved model's hyperparameters are reused, so -n/-d
    must not be given.
    """
    if list_gutenberg:
        # nltk is imported lazily so it is only required when a Gutenberg
        # option is actually used; file/mbox training works without it.
        import nltk
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        click.echo("Available Gutenberg corpus keys:")
        for key in gutenberg_corpus.fileids():
            click.echo(f" {key}")
        return

    corpus = []
    # -f/--filepath is multiple=True, so iterate (empty tuple -> no-op).
    for fp in filepath:
        with open(fp) as f:
            corpus.extend(tokenize_text(f.read(), tokenize_opts))
    if gutenberg:
        import nltk  # lazy: see note above
        nltk.download("gutenberg", quiet=True)
        from nltk.corpus import gutenberg as gutenberg_corpus
        for key in gutenberg:
            corpus.extend(tokenize_words(gutenberg_corpus.words(key), tokenize_opts))
    if mbox:
        corpus.extend(tokenize_text(read_mail_text(mbox), tokenize_opts))

    if not corpus:
        raise click.UsageError("No training data provided. Use --filepath, --gutenberg, or --mbox.")

    if resume_path:
        # Architecture comes from the saved model; refuse conflicting options.
        if context_window_words is not None or embedding_dim is not None:
            raise click.UsageError("-n/--context-window-words and -d/--embedding-dim are ignored when resuming a saved model. Remove these options.")
        model = TinyLanguageModel()
        model.load(resume_path)
        model.train(corpus, epochs=epochs, lr=learning_rate, resume=True)
    else:
        model = TinyLanguageModel(n=context_window_words or 2, embedding_dim=embedding_dim or 32)
        model.train(corpus, epochs=epochs, lr=learning_rate)
    model.save(output)
    click.echo(f"Model saved to {output}")
|
||||
|
||||
|
||||
@cli.command()
@click.option('-m', "--model", 'model_path', required=True, type=click.Path(exists=True), help="Trained model file to load.")
@click.option('-l', "--length", default=50, show_default=True, help="Number of tokens to generate.")
@click.option('-p', "--prompt", help="Prompt to start generation.")
@click.option('-v', "--verbose", is_flag=True, help="Display step-by-step generation as a table.")
@click.option('-i', "--interact", is_flag=True, help="Drop into interactive shell after generating.")
@click.option('-t', "--tokenize", 'tokenize_opts', multiple=True, type=click.Choice(['lower', 'char', 'alpha']), help="Tokenization options for the prompt.")
def generate(model_path, length, prompt, verbose, interact, tokenize_opts):
    """Generate text using a trained model."""
    model = TinyLanguageModel()
    model.load(model_path)

    # Character-level models are joined without separators; word-level
    # display uses the punctuation-aware join helper.
    join_fn = ''.join if 'char' in tokenize_opts else None
    display_join = join_fn if join_fn else join

    # Tokenize the prompt the same way the training corpus was tokenized.
    prompt_tokens = None
    if prompt:
        prompt_tokens = tokenize_text(prompt, tokenize_opts)

    if not verbose:
        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn)
    else:
        import textwrap
        from tabulate import tabulate

        rows = []

        def step_callback(context, options, chosen):
            # One table row per generated token: context, candidates, pick.
            wrapped = textwrap.fill(', '.join(options), width=60)
            rows.append([display_join(list(context)), wrapped, chosen])

        output = model.generate(length, prompt=prompt_tokens, join_fn=join_fn, step_callback=step_callback)
        click.echo(tabulate(rows, headers=["Context", "Options", "Selected"], tablefmt="simple"))
        click.echo()

    click.echo(output)

    if interact:
        import code
        code.interact(local=locals(), banner="Entering interactive shell. 'model' and 'output' are available.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point when this module is executed directly as a script.
    cli()
|
||||
61
tlm/helpers.py
Normal file
61
tlm/helpers.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import email
import mailbox
from collections import deque
from email.policy import default
from itertools import islice

import numpy as np
from tqdm import tqdm
|
||||
|
||||
|
||||
def rolling_window(iterable, n):
    """Passes a rolling window over the iterable, yielding each n-length tuple.

    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)

    Yields nothing when the iterable has fewer than n items.
    """
    it = iter(iterable)
    # deque(maxlen=n) slides in O(1) per element; the previous
    # `window[1:] + [next(it)]` copied the whole window every step.
    window = deque(islice(it, n), maxlen=n)
    if len(window) < n:
        return
    yield tuple(window)
    for item in it:
        window.append(item)  # maxlen drops the oldest element automatically
        yield tuple(window)
|
||||
|
||||
|
||||
def read_mail_text(mbox_path):
    """
    Extract and concatenate all plaintext content from an mbox file.

    Returns the stripped text/plain bodies of every message, joined by
    blank lines. Parts that fail to decode are skipped (best-effort).
    """
    texts = []
    mbox = mailbox.mbox(
        mbox_path,
        # Parse with the modern email policy so get_content() is available.
        factory=lambda f: email.message_from_binary_file(f, policy=default)
    )
    for msg in tqdm(mbox):
        # walk() yields the message itself for non-multipart messages, so a
        # single loop replaces the duplicated multipart/simple branches.
        for part in msg.walk():
            if part.get_content_type() != "text/plain":
                continue
            try:
                text = part.get_content()
            except Exception:
                # Undecodable/malformed part: skip it, as before.
                continue
            if text:
                texts.append(text.strip())
    return "\n\n".join(texts)
|
||||
|
||||
|
||||
def join(tokens, punctuation=".,?!:;'\""):
    "Joins text, but does not give extra space before punctuation."
    # Prefix every non-punctuation token with a space, then trim the lead.
    pieces = (tok if tok in punctuation else ' ' + tok for tok in tokens)
    return ''.join(pieces).strip()
|
||||
|
||||
|
||||
def softmax(x):
    "Convert a vector of scores (logits) into a probability distribution."
    # Subtracting the max before exponentiating avoids overflow without
    # changing the result (softmax is shift-invariant).
    exps = np.exp(x - x.max())
    return exps / exps.sum()
|
||||
170
tlm/model.py
Normal file
170
tlm/model.py
Normal file
@@ -0,0 +1,170 @@
|
||||
import json
|
||||
import random
|
||||
import numpy as np
|
||||
from .helpers import rolling_window, join, softmax
|
||||
|
||||
class TinyLanguageModel:
    """
    A language model that uses learned embeddings to predict the next word.

    Instead of counting how often each word follows each context (as in the
    matrix model), this model learns dense vector representations of words,
    called embeddings, and uses them to estimate the probability of the next word.

    The model has two learned matrices:

    E: (vocab_size x embedding_dim) -- one row per word in the vocabulary,
       representing that word as a dense vector
    W: (embedding_dim x vocab_size) -- maps an embedding to one score per word

    To predict the next word, the model:
    1. Looks up the embedding E[i] for each word i in the context window.
    2. Averages those embeddings to get a single context vector.
    3. Computes logits = context_vector @ W (one score per vocabulary word).
    4. Applies softmax to turn scores into probabilities.
    5. Samples the next word from those probabilities.

    Training works by gradient descent: the model repeatedly sees (context, target)
    pairs, makes a prediction, computes how wrong it was (cross-entropy loss),
    and nudges E and W in the direction that reduces the loss.
    """

    def __init__(self, n=2, embedding_dim=32):
        "Create a new model with context window of n words and given embedding size."
        self.n = n
        self.embedding_dim = embedding_dim
        self.vocab = None        # sorted list of known words (set by train/load)
        self.word_to_idx = None  # word -> row index into E
        self.E = None  # input embeddings: (vocab_size, embedding_dim)
        self.W = None  # output weights: (embedding_dim, vocab_size)
        self.b = None  # output bias: (vocab_size,)

    def train(self, words, epochs=5, lr=0.05, resume=False):
        """Learn word embeddings from a list of words using gradient descent.

        If resume=True, the existing vocabulary and learned matrices (E, W, b) are
        kept and training continues from where it left off. Words in the corpus that
        are not in the saved vocabulary are skipped with a warning.

        Raises:
            ValueError: if resume=True on a model that was never trained or
                loaded, or if the corpus yields no usable training pairs.
        """
        if resume:
            if self.word_to_idx is None:
                # Previously this fell through to `w not in None` -> TypeError.
                raise ValueError("Cannot resume training: the model has no vocabulary. Load or train it first.")
            oov = sorted({w for w in words if w not in self.word_to_idx})
            if oov:
                print(f"Warning: {len(oov)} word(s) not in saved vocabulary will be skipped: {oov[:10]}{'...' if len(oov) > 10 else ''}")
        else:
            self.vocab = sorted(set(words))
            self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
            self.initialize_matrices()

        # One example per corpus position: (n-word context, following word).
        training_data = [
            (window[:-1], window[-1])
            for window in rolling_window(words, self.n + 1)
            if all(w in self.word_to_idx for w in window)
        ]
        if not training_data:
            # Without this guard, the per-epoch loss report divides by zero.
            raise ValueError("No usable training examples: corpus is shorter than the context window or contains only unknown words.")

        from tqdm import tqdm
        for epoch in range(epochs):
            total_loss = 0.0
            random.shuffle(training_data)
            for context, target in tqdm(training_data, desc=f"Epoch {epoch + 1}/{epochs}", leave=False):
                total_loss += self._step(context, target, lr)
            print(f"Epoch {epoch + 1}/{epochs} loss={total_loss / len(training_data):.4f}")

    def initialize_matrices(self):
        "Randomly initializes embedding, weight, and bias matrices"
        vocab_size = len(self.vocab)
        # Fixed seed keeps initialization reproducible across runs.
        rng = np.random.default_rng(42)
        # Small (x0.01) init keeps early logits near zero -> near-uniform softmax.
        self.E = rng.standard_normal((vocab_size, self.embedding_dim)) * 0.01
        self.W = rng.standard_normal((self.embedding_dim, vocab_size)) * 0.01
        self.b = np.zeros(vocab_size)

    def _context_embedding(self, context):
        "Average the embeddings of the words in the context window."
        indices = [self.word_to_idx[w] for w in context]
        return self.E[indices].mean(axis=0)

    def _forward(self, context):
        "Return (context_emb, probs) for a given context tuple."
        ctx_emb = self._context_embedding(context)
        logits = ctx_emb @ self.W + self.b
        probs = softmax(logits)
        return ctx_emb, probs

    def _step(self, context, target, lr):
        "One gradient-descent step on a single (context, target) pair. Returns the loss."
        ctx_emb, probs = self._forward(context)
        target_idx = self.word_to_idx[target]
        # Cross-entropy; the epsilon avoids log(0) if the target prob underflows.
        loss = -np.log(probs[target_idx] + 1e-12)

        # Gradient of cross-entropy loss w.r.t. logits: probs with 1 subtracted at target
        d_logits = probs.copy()
        d_logits[target_idx] -= 1.0

        # Gradients for output weights and bias
        d_W = np.outer(ctx_emb, d_logits)  # (embedding_dim, vocab_size)
        d_b = d_logits  # (vocab_size,)

        # Gradient flows back through averaging to each context word's embedding
        d_ctx_emb = self.W @ d_logits  # (embedding_dim,)
        d_per_word = d_ctx_emb / len(context)
        for idx in [self.word_to_idx[w] for w in context]:
            self.E[idx] -= lr * d_per_word

        self.W -= lr * d_W
        self.b -= lr * d_b

        return loss

    def generate(self, length, prompt=None, join_fn=None, step_callback=None):
        "Create new text using the learned embeddings."
        if self.E is None:
            raise Exception("The model has not been trained.")

        output = list(prompt or self.get_random_prompt())
        # Drop any prompt tokens not in vocabulary
        output = [w for w in output if w in self.word_to_idx]
        if len(output) < self.n:
            # Prompt too short (or entirely out-of-vocabulary): random start.
            output = list(self.get_random_prompt())

        while len(output) < length:
            context = tuple(output[-self.n:])
            _, probs = self._forward(context)
            chosen_idx = np.random.choice(len(self.vocab), p=probs)
            chosen_word = self.vocab[chosen_idx]

            if step_callback:
                # Report up to the 10 most likely candidates (prob > 0.1%).
                top_indices = np.argsort(probs)[-10:][::-1]
                top_words = [self.vocab[i] for i in top_indices if probs[i] > 0.001]
                step_callback(context, top_words, chosen_word)

            output.append(chosen_word)

        return (join_fn or join)(output)

    def get_random_prompt(self):
        "Return a random context window drawn from the vocabulary."
        return tuple(random.choices(self.vocab, k=self.n))

    def save(self, filepath):
        "Save the model to a JSON file."
        model_data = {
            "n": self.n,
            "embedding_dim": self.embedding_dim,
            "vocab": self.vocab,
            "E": self.E.tolist(),
            "W": self.W.tolist(),
            "b": self.b.tolist(),
        }
        with open(filepath, "w") as f:
            json.dump(model_data, f)

    def load(self, filepath):
        "Load a model from a JSON file."
        with open(filepath) as f:
            data = json.load(f)
        self.n = data["n"]
        self.embedding_dim = data["embedding_dim"]
        self.vocab = data["vocab"]
        # Rebuild the derived index; it is not stored in the JSON.
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.E = np.array(data["E"])
        self.W = np.array(data["W"])
        self.b = np.array(data["b"])
|
||||
31
tlm/tokenization.py
Normal file
31
tlm/tokenization.py
Normal file
@@ -0,0 +1,31 @@
|
||||
def compress_whitespace(text):
    """Collapse sequences of whitespace into a single space."""
    import re
    # Collapse runs of whitespace, then drop any leading/trailing space.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _is_alpha_token(token):
|
||||
return all(c.isalpha() or c in " '" for c in token)
|
||||
|
||||
|
||||
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options."""
    source = text.lower() if 'lower' in options else text
    words = source.split()
    if 'alpha' in options:
        words = [w for w in words if _is_alpha_token(w)]
    if 'char' in options:
        # Character-level: re-join and split into individual characters.
        return list(compress_whitespace(' '.join(words)))
    return words
|
||||
|
||||
|
||||
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence."""
    tokens = [w.lower() for w in words] if 'lower' in options else words
    if 'alpha' in options:
        tokens = [w for w in tokens if _is_alpha_token(w)]
    if 'char' in options:
        # Character-level: join with spaces and split into characters.
        return list(compress_whitespace(' '.join(tokens)))
    return list(tokens)
|
||||
Reference in New Issue
Block a user