33 lines
844 B
Python
33 lines
844 B
Python
import io
|
|
import os
|
|
import zipfile
|
|
import urllib.request
|
|
|
|
import pandas as pd
|
|
from sklearn.datasets import get_data_home
|
|
|
|
|
|
# Download location of the zipped SMS Spam Collection archive on the UCI
# machine-learning repository (repository base + dataset-specific file).
URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    + "00228/smsspamcollection.zip"
)
|
|
|
|
|
|
def load_spam():
    """Load the SMS Spam Collection as a two-column DataFrame.

    The data file is cached at ``<data_home>/spam/SMSSpamCollection``, where
    ``<data_home>`` is whatever ``sklearn.datasets.get_data_home()`` returns.
    If the file is not present it is downloaded first via ``_fetch``.

    Returns:
        pandas.DataFrame: One row per line of the data file, with columns
        ``"label"`` and ``"message"`` (the file itself has no header row).
    """
    # Imported locally so this function does not depend on a module-level
    # import that the rest of the file does not otherwise need.
    import csv

    path = os.path.join(get_data_home(), "spam", "SMSSpamCollection")
    if not os.path.exists(path):
        _fetch(path)

    # The file is raw tab-separated text, not quoted CSV. With pandas' default
    # quoting, a message containing a double-quote character opens a "quoted
    # field" that can swallow tabs and newlines, silently merging rows.
    # QUOTE_NONE makes every '"' an ordinary character.
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["label", "message"],
        quoting=csv.QUOTE_NONE,
    )
|
|
|
|
|
|
def _fetch(dest):
    """Download the SMS Spam Collection archive and extract its data to *dest*.

    The archive at ``URL`` is read fully into memory, the ``SMSSpamCollection``
    member is extracted, and its bytes are written to *dest*. Missing parent
    directories of *dest* are created.

    The bytes are first written to a sibling ``<dest>.part`` file and then
    moved into place, so an interrupted write never leaves a truncated file at
    *dest* that a later existence check would mistake for a complete download.

    Args:
        dest: Filesystem path the extracted data file is written to.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the response is not a valid zip archive.
        KeyError: If the archive has no ``SMSSpamCollection`` member.
        OSError: If the destination cannot be created or written.
    """
    parent = os.path.dirname(dest)
    if parent:
        # dirname is "" for a bare file name, and makedirs("") would raise.
        os.makedirs(parent, exist_ok=True)

    print("Downloading SMS Spam Collection...")
    with urllib.request.urlopen(URL) as response:
        archive_bytes = response.read()

    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
        content = zf.read("SMSSpamCollection")

    partial = dest + ".part"
    try:
        with open(partial, "wb") as out:
            out.write(content)
        # Same-directory rename: replaces dest in one step once the data is
        # completely written.
        os.replace(partial, dest)
    except BaseException:
        # Best-effort cleanup of the partial file; the original error is what
        # the caller needs to see, so cleanup failures are ignored.
        try:
            os.remove(partial)
        except OSError:
            pass
        raise
|