"""Loader for the SMS Spam Collection dataset.

The zip archive at ``URL`` is downloaded on first use and the extracted
``SMSSpamCollection`` file is cached in a ``spam`` directory under
scikit-learn's data home, so subsequent calls read from disk.
"""

import csv
import io
import os
import urllib.request
import zipfile

import pandas as pd
from sklearn.datasets import get_data_home

URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00228/smsspamcollection.zip"
)


def load_spam():
    """Return the SMS Spam Collection as a two-column DataFrame.

    The data file is looked up at ``<data_home>/spam/SMSSpamCollection``;
    if it is not there yet it is downloaded first via ``_fetch``.

    Returns:
        pandas.DataFrame with the columns ``label`` and ``message``, one
        row per line of the tab-separated data file.

    Raises:
        Whatever ``_fetch`` raises when the download or extraction fails
        (e.g. ``urllib.error.URLError``, ``zipfile.BadZipFile``,
        ``OSError``).
    """
    path = os.path.join(get_data_home(), "spam", "SMSSpamCollection")
    if not os.path.exists(path):
        _fetch(path)
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["label", "message"],
        # The file is raw tab-separated text without any quoting
        # convention. With default quoting, a message containing a
        # literal double quote can be parsed as a quoted field and
        # swallow the rows that follow it.
        quoting=csv.QUOTE_NONE,
    )


def _fetch(dest, timeout=60):
    """Download the archive at ``URL`` and extract the data file to *dest*.

    Args:
        dest: Path the extracted ``SMSSpamCollection`` member is written to.
            Missing parent directories are created.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``
            so a stalled connection cannot block forever.

    Raises:
        urllib.error.URLError: If the download fails or times out.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
        KeyError: If the archive has no ``SMSSpamCollection`` member.
        OSError: If the destination cannot be written.
    """
    parent = os.path.dirname(dest)
    if parent:  # os.makedirs("") raises, so skip it for bare filenames
        os.makedirs(parent, exist_ok=True)

    print("Downloading SMS Spam Collection...")
    with urllib.request.urlopen(URL, timeout=timeout) as response:
        data = response.read()

    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        content = zf.read("SMSSpamCollection")

    # Write to a sibling temp file and rename it into place. load_spam()
    # treats the mere existence of *dest* as "cached", so a write that is
    # interrupted half-way must never leave a truncated file at *dest*.
    tmp = dest + ".part"
    try:
        with open(tmp, "wb") as f:
            f.write(content)
        os.replace(tmp, dest)
    finally:
        # After a successful replace the temp file is already gone.
        try:
            os.remove(tmp)
        except FileNotFoundError:
            pass