Files
lab_matrices/tlm/tokenization.py
2026-03-02 12:29:18 -05:00

32 lines
985 B
Python

def compress_whitespace(text):
    """Collapse sequences of whitespace into a single space.

    Leading and trailing whitespace is removed; every interior run of
    whitespace (spaces, tabs, newlines, Unicode whitespace) becomes one space.

    Args:
        text: The string to normalize.

    Returns:
        The whitespace-normalized string ('' if *text* is empty or all whitespace).
    """
    # str.split() with no separator splits on arbitrary whitespace runs and
    # drops leading/trailing whitespace, so rejoining with a single space is
    # equivalent to re.sub(r'\s+', ' ', text).strip() — without the regex.
    return ' '.join(text.split())
def _is_alpha_token(token):
return all(c.isalpha() or c in " '" for c in token)
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Recognized options (membership-tested, so any container works):
      'lower' — lowercase the text before splitting;
      'alpha' — keep only tokens made of letters, spaces, or apostrophes;
      'char'  — return a list of single characters instead of word tokens.
    """
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [tok for tok in tokens if _is_alpha_token(tok)]
    if 'char' in options:
        # Character mode: rebuild a normalized string, then explode it.
        return list(compress_whitespace(' '.join(tokens)))
    return tokens
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Mirrors tokenize_text's options ('lower', 'alpha', 'char') but starts
    from a sequence of word tokens rather than raw text. Always returns a
    new list; the input sequence is never mutated.
    """
    result = list(words)
    if 'lower' in options:
        result = [tok.lower() for tok in result]
    if 'alpha' in options:
        result = [tok for tok in result if _is_alpha_token(tok)]
    if 'char' in options:
        # Character mode: rebuild a normalized string, then explode it.
        return list(compress_whitespace(' '.join(result)))
    return result