Files
lab_classification_features/cli/data.py
Chris Proctor aaf5b17ad8 Initial commit
2026-06-06 21:36:59 -04:00

33 lines
844 B
Python

import csv
import io
import os
import urllib.request
import zipfile

import pandas as pd
from sklearn.datasets import get_data_home
# Download location of the zipped SMS Spam Collection on the UCI archive host.
# The archive is fetched by _fetch(), which extracts the "SMSSpamCollection"
# member from it.
URL = (
    "https://archive.ics.uci.edu"
    "/ml/machine-learning-databases"
    "/00228/smsspamcollection.zip"
)
def load_spam():
    """Load the SMS Spam Collection as a DataFrame, downloading it if needed.

    The data file is cached at ``<data home>/spam/SMSSpamCollection``, where
    the data home is whatever ``sklearn.datasets.get_data_home()`` returns.
    When that file does not exist yet, it is fetched first via ``_fetch``.

    Returns:
        A ``pandas.DataFrame`` with two columns, ``label`` and ``message``,
        parsed from the tab-separated data file (which has no header row).
    """
    path = os.path.join(get_data_home(), "spam", "SMSSpamCollection")
    if not os.path.exists(path):
        _fetch(path)
    # The file is raw tab-separated text, not quoted CSV. With pandas' default
    # quoting, a message that begins with an unbalanced double quote makes the
    # parser treat everything up to the next quote -- including newlines -- as
    # one field, silently merging several records. QUOTE_NONE keeps quote
    # characters literal so each line stays its own row.
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["label", "message"],
        quoting=csv.QUOTE_NONE,
    )
def _fetch(dest):
    """Download the SMS Spam Collection archive and extract its data file.

    The zip archive at ``URL`` is read fully into memory, the member named
    ``SMSSpamCollection`` is extracted from it, and its bytes are written to
    ``dest``. Missing parent directories of ``dest`` are created.

    Args:
        dest: Filesystem path where the extracted data file is written.

    Raises:
        urllib.error.URLError: If the download fails or times out.
        zipfile.BadZipFile: If the response is not a valid zip archive.
        KeyError: If the archive has no ``SMSSpamCollection`` member.
        OSError: If the destination cannot be created or written.
    """
    parent = os.path.dirname(dest)
    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)

    print("Downloading SMS Spam Collection...")
    # Without a timeout a stalled connection would block the caller forever.
    with urllib.request.urlopen(URL, timeout=60) as response:
        data = response.read()

    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        content = zf.read("SMSSpamCollection")

    # Write to a temporary sibling and rename into place, so an interrupted
    # write can never leave a truncated file at `dest` that a later
    # existence check would mistake for a complete download.
    partial = f"{dest}.{os.getpid()}.part"
    try:
        with open(partial, "wb") as f:
            f.write(content)
        os.replace(partial, dest)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up when the write or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)