"""Tokenization helpers for raw text and pre-split word sequences."""
def compress_whitespace(text):
    """Replace each run of whitespace in *text* with a single space and trim the ends."""
    import re

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def _is_alpha_token(token):
|
|
return all(c.isalpha() or c in " '" for c in token)
|
|
|
|
|
|
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Supported options: 'lower' (case-fold the text before splitting),
    'alpha' (keep only tokens of letters, spaces, and apostrophes), and
    'char' (return individual characters of the normalized joined text
    instead of word tokens).
    """
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [tok for tok in tokens if _is_alpha_token(tok)]
    if 'char' not in options:
        return tokens
    return list(compress_whitespace(' '.join(tokens)))
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Supported options mirror tokenize_text: 'lower', 'alpha', and 'char'
    (the last returns the characters of the space-joined, whitespace-
    normalized token string). Always returns a new list.
    """
    tokens = list(words)
    if 'lower' in options:
        tokens = [tok.lower() for tok in tokens]
    if 'alpha' in options:
        tokens = [tok for tok in tokens if _is_alpha_token(tok)]
    if 'char' in options:
        joined = ' '.join(tokens)
        return list(compress_whitespace(joined))
    return tokens