import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each sample through ``extract_fn``.

    Parameters
    ----------
    extract_fn : callable
        Called once per sample in ``transform``; its return values are
        collected into a list.
    """

    def __init__(self, extract_fn):
        self.extract_fn = extract_fn

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``[extract_fn(msg) for msg in X]`` as a list."""
        return [self.extract_fn(msg) for msg in X]


class FeatureClassifier(BaseEstimator, ClassifierMixin):
    """Spam/ham classifier built on a few hand-written message features.

    ``fit`` builds a pipeline of ``FeatureExtractor`` (using
    ``extract_features``), ``DictVectorizer`` and ``LogisticRegression``.

    Parameters
    ----------
    C : float, default=1.0
        Passed through as ``C`` to ``LogisticRegression``.
    """

    def __init__(self, C=1.0):
        self.C = C

    def fit(self, X, y):
        """Fit the pipeline on messages ``X`` with labels ``y``.

        Labels are binarised as ``label == "spam"``: the exact string
        ``"spam"`` becomes 1 and every other label is trained as 0 (ham).

        Returns
        -------
        self
        """
        self._pipeline = Pipeline([
            ("features", FeatureExtractor(self.extract_features)),
            ("vectorizer", DictVectorizer()),
            ("classifier", LogisticRegression(C=self.C, max_iter=1000)),
        ])
        y_binary = (np.array(y) == "spam").astype(int)
        self._pipeline.fit(X, y_binary)
        # Index i is the label emitted for binary class i in ``predict``.
        self.classes_ = np.array(["ham", "spam"])
        return self

    def predict(self, X):
        """Return an array of ``"spam"`` / ``"ham"`` labels for ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        """
        check_is_fitted(self, "_pipeline")
        y_binary = self._pipeline.predict(X)
        return np.where(y_binary == 1, "spam", "ham")

    def extract_features(self, message):
        """Return the feature dict for a single message string.

        Keys: ``contains_free`` (1 if the lower-cased message contains
        ``"free"``, else 0), ``num_exclamations`` (count of ``"!"``) and
        ``length`` (``len(message)``).
        """
        return {
            "contains_free": int("free" in message.lower()),
            "num_exclamations": message.count("!"),
            "length": len(message),
        }

    def feature_weights(self, top_n=10):
        """Return the ``top_n`` features with the largest absolute weight.

        Returns
        -------
        list of (name, weight) tuples
            Taken from the fitted vectorizer's feature names and the first
            row of the classifier's ``coef_``, sorted by ``abs(weight)`` in
            descending order.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        check_is_fitted(self, "_pipeline")
        vectorizer = self._pipeline.named_steps["vectorizer"]
        classifier = self._pipeline.named_steps["classifier"]
        names = vectorizer.get_feature_names_out()
        weights = classifier.coef_[0]
        pairs = sorted(zip(names, weights), key=lambda x: abs(x[1]), reverse=True)
        return pairs[:top_n]