import email
import logging
import mailbox
from collections import deque
from email.policy import default
from itertools import islice

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration
    # when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable

logger = logging.getLogger(__name__)


def rolling_window(iterable, n):
    """Pass a rolling window over the iterable, yielding each n-length tuple.

    rolling_window(range(5), 3) -> (0, 1, 2), (1, 2, 3), (2, 3, 4)

    Nothing is yielded when the iterable holds fewer than ``n`` items.

    Raises:
        ValueError: if ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    it = iter(iterable)
    # A bounded deque drops the oldest item on append, so sliding the window
    # does not require rebuilding it on every step.
    window = deque(islice(it, n), maxlen=n)
    if len(window) < n:
        return
    yield tuple(window)
    for item in it:
        window.append(item)
        yield tuple(window)


def read_mail_text(mbox_path):
    """Extract and concatenate all plaintext content from an mbox file.

    Every ``text/plain`` part of every message is stripped of surrounding
    whitespace; non-empty results are joined with a blank line between them.
    Parts whose content cannot be retrieved are logged and skipped so that
    one bad message does not abort the whole run.

    Args:
        mbox_path: path to the mbox file.

    Returns:
        A single string with the collected text, or ``""`` if none was found.
    """
    texts = []
    # NOTE(review): mailbox.mbox is used with its default ``create=True``,
    # so a missing path produces an empty mailbox rather than an error.
    mbox = mailbox.mbox(
        mbox_path,
        factory=lambda f: email.message_from_binary_file(f, policy=default),
    )
    try:
        for msg in tqdm(mbox):
            # walk() yields the message itself when it is not multipart, so
            # a single loop covers both the multipart and the simple case.
            for part in msg.walk():
                if part.get_content_type() != "text/plain":
                    continue
                try:
                    text = part.get_content()
                except Exception:
                    # Best effort: keep going, but leave a trace of the skip.
                    logger.warning(
                        "Skipping text/plain part that could not be decoded",
                        exc_info=True,
                    )
                    continue
                if text:
                    texts.append(text.strip())
    finally:
        mbox.close()
    return "\n\n".join(texts)


def clean_corpus(corpus, max_length=10, remove_numbers=False, exclude=None):
    """Return the words of ``corpus`` that pass all enabled filters.

    Args:
        corpus: iterable of words.
        max_length: drop words longer than this; a falsy value disables
            the length filter.
        remove_numbers: when true, drop words for which ``isnumeric()``
            is true.
        exclude: optional container; words found in it are dropped.

    Returns:
        A new list with the surviving words in their original order.
    """

    def keep(word):
        if max_length and len(word) > max_length:
            return False
        if remove_numbers and word.isnumeric():
            return False
        if exclude and word in exclude:
            return False
        return True

    return [word for word in corpus if keep(word)]


def join(tokens, punctuation=".,?!:;'\""):
    """Join tokens with spaces, without adding a space before punctuation.

    A token counts as punctuation when ``token in punctuation`` is true.
    With the default string this is a substring test, so runs such as
    ``"?!"`` are attached to the preceding token as well.
    """
    pieces = (t if t in punctuation else ' ' + t for t in tokens)
    return ''.join(pieces).strip()