Initial commit
This commit is contained in:
63
tlm/helpers.py
Normal file
63
tlm/helpers.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import mailbox
|
||||
import email
|
||||
from email.policy import default
|
||||
from tqdm import tqdm
|
||||
|
||||
def rolling_window(iterable, n):
    """Pass a rolling window over the iterable, yielding each n-length tuple.

    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)

    Nothing is yielded when the iterable holds fewer than ``n`` items.

    Args:
        iterable: Any iterable; it is consumed lazily, one item per step.
        n: Window length; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        # A window of zero or negative length has no sensible meaning; the
        # previous code yielded tuples of inconsistent length in that case.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    window = []
    for item in iterable:
        window.append(item)
        if len(window) > n:
            # Drop the oldest element so the window keeps exactly n items.
            del window[0]
        if len(window) == n:
            yield tuple(window)
|
||||
|
||||
def read_mail_text(mbox_path):
    """
    Extract and concatenate all plaintext content from an mbox file.

    Every ``text/plain`` part of every message (or the message itself when it
    is not multipart) is decoded and stripped of surrounding whitespace.
    Parts that cannot be decoded, or that are empty after stripping, are
    skipped.

    Args:
        mbox_path: Path of the mbox file to read.

    Returns:
        A single string with the collected texts in mailbox order, separated
        from each other by one blank line.
    """
    texts = []
    mbox = mailbox.mbox(
        mbox_path,
        factory=lambda f: email.message_from_binary_file(f, policy=default),
    )
    try:
        for msg in tqdm(mbox):
            # walk() yields the message itself when it is not multipart, so a
            # single loop covers both single-part and multipart messages.
            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                try:
                    text = part.get_content()
                except Exception:
                    # Deliberate best effort: a part with a broken or unknown
                    # encoding must not abort the whole extraction.
                    continue
                # Strip before testing so whitespace-only parts do not add
                # empty entries (and stray separators) to the result.
                text = text.strip() if text else ""
                if text:
                    texts.append(text)
    finally:
        # Release the underlying file handle even if iteration fails.
        mbox.close()
    return "\n\n".join(texts)
|
||||
|
||||
def clean_corpus(corpus, max_length=10, remove_numbers=False, exclude=None):
    """Filter the words of a corpus and return the survivors as a new list.

    A word is dropped when any of the following holds:

    * ``max_length`` is truthy and the word is longer than ``max_length``;
    * ``remove_numbers`` is truthy and ``word.isnumeric()`` is true;
    * ``exclude`` is truthy and the word is contained in ``exclude``.

    The relative order of the remaining words is preserved.
    """
    return [
        token
        for token in corpus
        if not (max_length and len(token) > max_length)
        and not (remove_numbers and token.isnumeric())
        and not (exclude and token in exclude)
    ]
|
||||
|
||||
def join(tokens, punctuation=".,?!:;'\""):
    """Join tokens into text without putting a space before punctuation.

    Each token found in ``punctuation`` is attached directly to the text
    before it; every other token is preceded by a single space. Leading and
    trailing whitespace of the final string is stripped.
    """
    pieces = []
    for token in tokens:
        if token not in punctuation:
            pieces.append(' ')
        pieces.append(token)
    return ''.join(pieces).strip()
|
||||
Reference in New Issue
Block a user