Add tokenization utilities (compress_whitespace, tokenize_text, tokenize_words)

This commit is contained in:
chris
2026-02-19 14:59:57 -05:00
parent f372786dbc
commit 01f58ded9a
5 changed files with 91 additions and 23 deletions

31
tlm/tokenization.py Normal file
View File

@@ -0,0 +1,31 @@
def compress_whitespace(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed entirely.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string; '' when *text* is empty or all whitespace.
    """
    # str.split() with no argument splits on (and discards) arbitrary
    # whitespace runs, so rejoining with single spaces is equivalent to
    # re.sub(r'\s+', ' ', text).strip() — without the per-call import
    # of `re` the original performed inside the function body.
    return ' '.join(text.split())
def _is_alpha_token(token):
return all(c.isalpha() or c in " '" for c in token)
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Args:
        text: The raw input string.
        options: Collection of option flags. Recognized flags:
            'lower' — lowercase the text before splitting;
            'alpha' — keep only tokens made of letters, spaces, apostrophes;
            'char'  — return single characters instead of word tokens.

    Returns:
        A list of word tokens, or a list of individual characters when
        'char' is requested.
    """
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [t for t in tokens if _is_alpha_token(t)]
    if 'char' not in options:
        return tokens
    # Character mode: rejoin with single spaces, normalize whitespace,
    # then explode the result into a list of characters.
    return list(compress_whitespace(' '.join(tokens)))
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Args:
        words: An iterable of word tokens.
        options: Collection of option flags. Recognized flags:
            'lower' — lowercase each token;
            'alpha' — keep only tokens made of letters, spaces, apostrophes;
            'char'  — return single characters instead of word tokens.

    Returns:
        A new list of word tokens (the input is never mutated), or a
        list of individual characters when 'char' is requested.
    """
    tokens = list(words)
    if 'lower' in options:
        tokens = [t.lower() for t in tokens]
    if 'alpha' in options:
        tokens = [t for t in tokens if _is_alpha_token(t)]
    if 'char' not in options:
        return tokens
    # Character mode mirrors tokenize_text: join, normalize, explode.
    return list(compress_whitespace(' '.join(tokens)))