def compress_whitespace(text: str) -> str:
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every whitespace run replaced by a single space and
        with leading/trailing whitespace removed.
    """
    import re

    return re.sub(r'\s+', ' ', text).strip()


def _is_alpha_token(token: str) -> bool:
    """Return True if *token* is non-empty and made of allowed characters.

    Allowed characters are alphabetic characters (``str.isalpha``), the
    space and the apostrophe, so tokens such as ``"don't"`` are accepted.
    """
    # ``all()`` is vacuously True for an empty iterable; without the explicit
    # emptiness check an empty token would be treated as alphabetic.
    if not token:
        return False
    return all(c.isalpha() or c in " '" for c in token)


def _apply_token_options(words, options) -> list:
    """Apply the 'alpha' and 'char' options shared by both tokenizers.

    Args:
        words: Iterable of word tokens (already lower-cased if requested).
        options: Container of option names; only 'alpha' and 'char' are
            consulted here.

    Returns:
        A new list of words, or a list of single characters when 'char' is
        present in ``options``.
    """
    if 'alpha' in options:
        words = [w for w in words if _is_alpha_token(w)]
    if 'char' in options:
        # Character mode: rejoin the words with single spaces and explode
        # the normalised string into its individual characters.
        return list(compress_whitespace(' '.join(words)))
    return list(words)


def tokenize_text(text: str, options) -> list:
    """Tokenize a raw text string according to the given options.

    Args:
        text: Raw input text; it is split on whitespace.
        options: Container of option names. Recognised names:
            'lower' lower-cases the text before splitting,
            'alpha' keeps only tokens accepted by ``_is_alpha_token``,
            'char' returns characters instead of words.

    Returns:
        A list of word tokens, or a list of characters when 'char' is set.
    """
    if 'lower' in options:
        text = text.lower()
    return _apply_token_options(text.split(), options)


def tokenize_words(words, options) -> list:
    """Apply tokenization options to an already word-tokenized sequence.

    Args:
        words: Iterable of word strings.
        options: Container of option names. Recognised names:
            'lower' lower-cases every word,
            'alpha' keeps only tokens accepted by ``_is_alpha_token``
            (empty strings are dropped),
            'char' returns characters instead of words.

    Returns:
        A new list of word tokens, or a list of characters when 'char' is
        set. The input sequence is not modified.
    """
    if 'lower' in options:
        words = [w.lower() for w in words]
    return _apply_token_options(words, options)