import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from cleaning.transformers import LowercaseTransformer


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each sample through ``extract_fn``.

    Parameters
    ----------
    extract_fn : callable
        Called once per sample; its return values are collected in a list.
        Inside ``FeatureClassifier`` it returns a feature dict that is fed
        to ``DictVectorizer``.
    """

    def __init__(self, extract_fn):
        self.extract_fn = extract_fn

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``[extract_fn(sample) for sample in X]``."""
        return [self.extract_fn(msg) for msg in X]


class FeatureClassifier(BaseEstimator, ClassifierMixin):
    """Spam/ham classifier built on hand-written message features.

    Messages go through ``LowercaseTransformer`` (project-local; presumably
    lowercases each message -- confirm against ``cleaning.transformers``),
    then ``extract_features``, ``DictVectorizer`` and finally a
    ``LogisticRegression``.

    Parameters
    ----------
    C : float, default=1.0
        Passed unchanged to ``LogisticRegression(C=...)``.

    Attributes
    ----------
    classes_ : ndarray of str
        ``["ham", "spam"]``; index ``i`` is the label ``predict`` emits for
        the internal binary target ``i``. Set by ``fit``.
    """

    def __init__(self, C=1.0):
        self.C = C

    def fit(self, X, y):
        """Fit the internal pipeline on messages ``X`` and labels ``y``.

        Every label equal to ``"spam"`` becomes the positive class (1); any
        other label is treated as the negative class (0).

        Returns
        -------
        self
        """
        pipeline = Pipeline([
            ("lowercase", LowercaseTransformer()),
            ("features", FeatureExtractor(self.extract_features)),
            ("vec", DictVectorizer()),
            ("clf", LogisticRegression(C=self.C, max_iter=1000)),
        ])
        y_binary = (np.asarray(y) == "spam").astype(int)
        pipeline.fit(X, y_binary)

        # Publish fitted state only after a successful fit, so a failed fit
        # never leaves a half-built pipeline behind for predict() to use.
        self._pipeline = pipeline
        # Index i corresponds to binary target i, matching predict().
        self.classes_ = np.array(["ham", "spam"])
        return self

    def predict(self, X):
        """Return an array of ``"spam"`` / ``"ham"`` labels for ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not completed successfully. This subclasses both
            ``AttributeError`` and ``ValueError``.
        """
        check_is_fitted(self, "_pipeline")
        y_binary = self._pipeline.predict(X)
        return np.where(y_binary == 1, "spam", "ham")

    def extract_features(self, message):
        """Return the feature dict for a single message.

        Keys: ``contains_free`` (1 if the substring ``"free"`` occurs in
        ``message``, else 0), ``num_exclamations`` (count of ``"!"``) and
        ``length`` (``len(message)``). The substring test is case-sensitive
        on whatever string is passed in; within the fitted pipeline the
        message has already passed through ``LowercaseTransformer``.
        """
        return {
            "contains_free": int("free" in message),
            "num_exclamations": message.count("!"),
            "length": len(message),
        }

    def feature_weights(self, top_n=10):
        """Return the ``top_n`` features with the largest absolute weight.

        Returns
        -------
        list of (feature_name, weight) tuples
            Sorted by ``abs(weight)`` in descending order.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not completed successfully.
        """
        check_is_fitted(self, "_pipeline")
        vec = self._pipeline.named_steps["vec"]
        clf = self._pipeline.named_steps["clf"]
        names = vec.get_feature_names_out()
        # Binary problem: coef_ has a single row holding the weights of the
        # positive ("spam") class.
        weights = clf.coef_[0]
        pairs = sorted(zip(names, weights), key=lambda x: abs(x[1]), reverse=True)
        return pairs[:top_n]