Add tokenization utilities (compress_whitespace, tokenize_text, tokenize_words)

This commit is contained in:
chris
2026-02-19 14:59:57 -05:00
parent f372786dbc
commit 01f58ded9a
5 changed files with 91 additions and 23 deletions

31
tlm/tokenization.py Normal file
View File

@@ -0,0 +1,31 @@
def compress_whitespace(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed entirely.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string; '' when *text* is empty or all whitespace.
    """
    # str.split() with no argument splits on (and discards) arbitrary
    # whitespace runs, so rejoining with single spaces is equivalent to
    # re.sub(r'\s+', ' ', text).strip() — without the per-call import
    # of `re` the original performed inside the function body.
    return ' '.join(text.split())
def _is_alpha_token(token):
return all(c.isalpha() or c in " '" for c in token)
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Args:
        text: The raw input string.
        options: Collection of option flags. Recognized flags:
            'lower' — lowercase the text before splitting;
            'alpha' — keep only tokens made of letters, spaces, apostrophes;
            'char'  — return single characters instead of word tokens.

    Returns:
        A list of word tokens, or a list of individual characters when
        'char' is requested.
    """
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [t for t in tokens if _is_alpha_token(t)]
    if 'char' not in options:
        return tokens
    # Character mode: rejoin with single spaces, normalize whitespace,
    # then explode the result into a list of characters.
    return list(compress_whitespace(' '.join(tokens)))
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Args:
        words: An iterable of word tokens.
        options: Collection of option flags. Recognized flags:
            'lower' — lowercase each token;
            'alpha' — keep only tokens made of letters, spaces, apostrophes;
            'char'  — return single characters instead of word tokens.

    Returns:
        A new list of word tokens (the input is never mutated), or a
        list of individual characters when 'char' is requested.
    """
    tokens = list(words)
    if 'lower' in options:
        tokens = [t.lower() for t in tokens]
    if 'alpha' in options:
        tokens = [t for t in tokens if _is_alpha_token(t)]
    if 'char' not in options:
        return tokens
    # Character mode mirrors tokenize_text: join, normalize, explode.
    return list(compress_whitespace(' '.join(tokens)))