Initial commit

This commit is contained in:
mwc
2026-02-09 12:15:12 -05:00
commit ce251fddbe
10 changed files with 410 additions and 0 deletions

63
tlm/helpers.py Normal file
View File

@@ -0,0 +1,63 @@
import email
import mailbox
from collections import deque
from email.policy import default
from itertools import islice

from tqdm import tqdm
def rolling_window(iterable, n):
    """Pass a rolling window over the iterable, yielding each n-length tuple.

    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)

    Nothing is yielded when the iterable holds fewer than ``n`` items.

    Args:
        iterable: Any iterable; it is consumed lazily, one item per step.
        n: Window length; must be at least 1.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        # A non-positive size cannot produce n-length tuples; reject it
        # instead of silently yielding windows of the wrong length.
        raise ValueError("n must be at least 1, got {!r}".format(n))
    it = iter(iterable)
    # Pre-fill with the first n items. maxlen=n makes every later append
    # drop the oldest item, which slides the window forward by one.
    window = deque(islice(it, n), maxlen=n)
    if len(window) < n:
        return
    yield tuple(window)
    for item in it:
        window.append(item)
        yield tuple(window)
def read_mail_text(mbox_path):
    """Extract and concatenate all plaintext content from an mbox file.

    Every ``text/plain`` part of every message, including parts nested
    inside multipart messages, is stripped of surrounding whitespace and
    the pieces are joined with a blank line between them.

    Args:
        mbox_path: Path of the mbox file to read.

    Returns:
        One string holding all collected text bodies; the empty string when
        no plaintext part was found.
    """
    texts = []
    # NOTE(review): mailbox.mbox defaults to create=True, so a path that does
    # not exist is created as an empty mailbox instead of raising. Confirm
    # whether that is wanted for a read-only helper.
    mbox = mailbox.mbox(
        mbox_path,
        factory=lambda f: email.message_from_binary_file(f, policy=default),
    )
    try:
        for msg in tqdm(mbox):
            # walk() yields the message itself first and then any nested
            # parts, so single-part and multipart messages share one path.
            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                try:
                    text = part.get_content()
                except Exception:
                    # Deliberately best-effort: a part whose body cannot be
                    # decoded is skipped so that one malformed message does
                    # not abort the whole import.
                    continue
                if text:
                    texts.append(text.strip())
    finally:
        # Release the underlying file handle even if iteration fails.
        mbox.close()
    return "\n\n".join(texts)
def clean_corpus(corpus, max_length=10, remove_numbers=False, exclude=None):
    """Return the words of ``corpus`` that pass every enabled filter.

    Args:
        corpus: Iterable of words.
        max_length: Words longer than this are dropped; a falsy value
            (``None`` or ``0``) disables the length filter.
        remove_numbers: When true, words for which ``str.isnumeric()`` holds
            are dropped.
        exclude: Optional container; words found in it are dropped. A falsy
            value disables this filter.

    Returns:
        A new list with the surviving words in their original order.
    """
    def is_rejected(token):
        # Checks run in the same order as the filters are listed above and
        # short-circuit, so a disabled filter never inspects the token.
        return (
            (max_length and len(token) > max_length)
            or (remove_numbers and token.isnumeric())
            or (exclude and token in exclude)
        )

    return [token for token in corpus if not is_rejected(token)]
def join(tokens, punctuation=".,?!:;'\""):
"Joins text, but does not give extra space for punctuation"
tokens = [t if t in punctuation else ' ' + t for t in tokens]
return ''.join(tokens).strip()