Files
lab_matrices/tlm/tokenization.py
2026-03-02 12:29:18 -05:00

32 lines
985 B
Python

def compress_whitespace(text):
    """Collapse sequences of whitespace into a single space.

    Leading and trailing whitespace is removed; every interior run of
    whitespace (spaces, tabs, newlines, Unicode whitespace) becomes one space.

    Args:
        text: The string to normalize.

    Returns:
        The whitespace-normalized string ('' if *text* is empty or all whitespace).
    """
    # str.split() with no separator splits on arbitrary whitespace runs and
    # drops leading/trailing whitespace, so rejoining with a single space is
    # equivalent to re.sub(r'\s+', ' ', text).strip() — without the regex.
    return ' '.join(text.split())
def _is_alpha_token(token):
return all(c.isalpha() or c in " '" for c in token)
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Recognized options (membership-tested, so any container works):
      'lower' — lowercase the text before splitting;
      'alpha' — keep only tokens made of letters, spaces, or apostrophes;
      'char'  — return a list of single characters instead of word tokens.
    """
    source = text.lower() if 'lower' in options else text
    tokens = source.split()
    if 'alpha' in options:
        tokens = [tok for tok in tokens if _is_alpha_token(tok)]
    if 'char' in options:
        # Character mode: rebuild a normalized string, then explode it.
        return list(compress_whitespace(' '.join(tokens)))
    return tokens
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Mirrors tokenize_text's options ('lower', 'alpha', 'char') but starts
    from a sequence of word tokens rather than raw text. Always returns a
    new list; the input sequence is never mutated.
    """
    result = list(words)
    if 'lower' in options:
        result = [tok.lower() for tok in result]
    if 'alpha' in options:
        result = [tok for tok in result if _is_alpha_token(tok)]
    if 'char' in options:
        # Character mode: rebuild a normalized string, then explode it.
        return list(compress_whitespace(' '.join(result)))
    return result