# lab_classification_features/cli/data.py

import csv
import io
import os
import urllib.request
import zipfile

import pandas as pd
from sklearn.datasets import get_data_home


# Download location of the zipped SMS Spam Collection archive.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"


def load_spam():
    """Return the SMS Spam Collection as a two-column DataFrame.

    The data file is looked up at ``<data home>/spam/SMSSpamCollection``,
    where the data home is whatever ``sklearn.datasets.get_data_home()``
    returns. If the file does not exist yet, it is downloaded first.

    Returns:
        pandas.DataFrame: one row per record of the tab-separated file,
        with columns named ``label`` and ``message``.
    """
    path = os.path.join(get_data_home(), "spam", "SMSSpamCollection")
    if not os.path.exists(path):
        _fetch(path)
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["label", "message"],
        # The file is plain tab-separated text, not quoted CSV. With the
        # default quoting a '"' inside a message opens a quoted field, and
        # an unbalanced one can swallow the following tabs/newlines and
        # silently merge rows. Treat quote characters as literal text.
        quoting=csv.QUOTE_NONE,
    )


def _fetch(dest):
    """Download the dataset archive and extract its data file to *dest*.

    The zip archive at ``URL`` is read fully into memory, the member named
    ``SMSSpamCollection`` is extracted, and its bytes are written to *dest*.
    The parent directory of *dest* is created if needed.

    The bytes are first written to ``dest + ".part"`` and then moved into
    place with ``os.replace``, so an interrupted write never leaves a
    truncated file at *dest* that a later existence check would accept.

    Args:
        dest: Path of the file to create.

    Raises:
        urllib.error.URLError: if the download fails.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
        KeyError: if the archive has no ``SMSSpamCollection`` member.
        OSError: if the destination cannot be written.
    """
    parent = os.path.dirname(dest)
    if parent:
        # os.makedirs("") raises, so only create a directory when dest has one.
        os.makedirs(parent, exist_ok=True)
    print("Downloading SMS Spam Collection...")
    with urllib.request.urlopen(URL) as response:
        archive = response.read()
    with zipfile.ZipFile(io.BytesIO(archive)) as zf:
        content = zf.read("SMSSpamCollection")

    partial = dest + ".part"
    try:
        with open(partial, "wb") as f:
            f.write(content)
        # Publish the finished file in one step.
        os.replace(partial, dest)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(partial):
            os.remove(partial)
        raise