Add lots of features
This commit is contained in:
31
tlm/tokenization.py
Normal file
31
tlm/tokenization.py
Normal file
@@ -0,0 +1,31 @@
|
||||
def compress_whitespace(text):
    """Replace every run of whitespace in *text* with a single space.

    Leading and trailing whitespace is removed entirely, so the result
    never starts or ends with a space.
    """
    import re
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _is_alpha_token(token):
|
||||
return all(c.isalpha() or c in " '" for c in token)
|
||||
|
||||
|
||||
def tokenize_text(text, options):
    """Tokenize a raw text string according to the given options.

    Splits *text* on whitespace and applies the same option pipeline as
    tokenize_words ('lower', 'alpha', 'char'), so the two functions
    cannot drift apart.

    Note: the original lowercased the whole string before splitting;
    delegating lowercases each word after splitting. These are
    equivalent because Unicode case mapping never introduces whitespace,
    so splitting is unaffected by the order.

    Returns a list of word tokens, or a list of single characters when
    'char' is among the options.
    """
    return tokenize_words(text.split(), options)
|
||||
|
||||
|
||||
def tokenize_words(words, options):
    """Apply tokenization options to an already word-tokenized sequence.

    Recognized options:
        'lower' -- lowercase every token
        'alpha' -- drop tokens containing anything other than letters,
                   spaces, or apostrophes
        'char'  -- return the individual characters of the tokens joined
                   by single spaces (whitespace-compressed)

    Returns a new list of word tokens, or character tokens when 'char'
    is among the options. The input sequence is never mutated.
    """
    tokens = list(words)
    if 'lower' in options:
        tokens = [token.lower() for token in tokens]
    if 'alpha' in options:
        tokens = [token for token in tokens if _is_alpha_token(token)]
    if 'char' in options:
        joined = ' '.join(tokens)
        return list(compress_whitespace(joined))
    return tokens
|
||||
Reference in New Issue
Block a user