64 lines
2.0 KiB
Python
64 lines
2.0 KiB
Python
import mailbox
|
|
import email
|
|
from email.policy import default
|
|
from tqdm import tqdm
|
|
|
|
def rolling_window(iterable, n):
    """Passes a rolling window over the iterable, yielding each n-length tuple.

    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)

    Yields nothing when the iterable holds fewer than n items.

    Raises:
        ValueError: if n is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not when it is called.
    """
    from collections import deque
    from itertools import islice

    # A window of zero (or negative) width is meaningless; without this guard
    # the seed window would be empty and the loop would emit 1-length tuples.
    if n < 1:
        raise ValueError("n must be at least 1")

    it = iter(iterable)
    # maxlen=n makes each append drop the oldest item automatically.
    window = deque(islice(it, n), maxlen=n)
    if len(window) < n:
        # Not enough items to fill even the first window.
        return

    yield tuple(window)
    for item in it:
        window.append(item)
        yield tuple(window)
|
|
|
|
def read_mail_text(mbox_path):
    """
    Extract and concatenate all plaintext content from an mbox file.

    Every ``text/plain`` part of every message is collected (for multipart
    messages the whole part tree is walked), stripped of surrounding
    whitespace, and joined with blank lines.

    Args:
        mbox_path: Path of the mbox file to read.

    Returns:
        A single string with the non-empty plaintext bodies separated by
        ``"\\n\\n"``. Parts for which ``get_content()`` raises are logged
        and skipped rather than aborting the whole read.
    """
    import logging

    logger = logging.getLogger(__name__)
    texts = []
    mbox = mailbox.mbox(
        mbox_path,
        factory=lambda f: email.message_from_binary_file(f, policy=default),
    )
    try:
        for msg in tqdm(mbox):
            # walk() yields the message itself when it is not multipart, so a
            # single loop covers both single-part and multipart messages.
            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                try:
                    text = part.get_content()
                except Exception as exc:
                    # Best-effort extraction: one bad part must not stop the
                    # run, but the failure should not vanish silently either.
                    logger.warning("Skipping unreadable text/plain part: %s", exc)
                    continue
                # Strip before the emptiness check so whitespace-only bodies
                # do not contribute empty entries to the joined output.
                text = text.strip() if text else ""
                if text:
                    texts.append(text)
    finally:
        # Release the underlying file handle even if iteration fails.
        mbox.close()
    return "\n\n".join(texts)
|
|
|
|
def clean_corpus(corpus, max_length=10, remove_numbers=False, exclude=None):
    """Return the words of corpus that pass every enabled filter, in order.

    A word is dropped when any of these holds:
      * max_length is truthy and the word is longer than max_length;
      * remove_numbers is truthy and word.isnumeric() is true;
      * exclude is truthy and the word is contained in exclude.
    """

    def is_dropped(token):
        # The `or` chain keeps the checks lazy and in the same order:
        # length first, then numeric, then the exclusion lookup.
        return bool(
            (max_length and len(token) > max_length)
            or (remove_numbers and token.isnumeric())
            or (exclude and token in exclude)
        )

    return [token for token in corpus if not is_dropped(token)]
|
|
|
|
def join(tokens, punctuation=".,?!:;'\""):
    """Join tokens into one string without a space before punctuation.

    Each token is preceded by a single space unless it is found in
    punctuation; leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    for token in tokens:
        if token not in punctuation:
            pieces.append(' ')
        pieces.append(token)
    return ''.join(pieces).strip()
|