from collections import Counter

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from classifiers.feature_classifier import FeatureExtractor
from cleaning.transformers import LowercaseTransformer, PunctuationRemover


class BagOfWordsClassifier(BaseEstimator, ClassifierMixin):
    """Spam/ham text classifier built on per-message word counts.

    Messages are first passed through a cleaning pipeline, then turned into
    ``{token: count}`` dictionaries, vectorized with ``DictVectorizer`` and
    classified with ``LogisticRegression``. Labels are the strings ``"spam"``
    and ``"ham"``; internally ``"spam"`` is encoded as 1 and everything else
    as 0.
    """

    def __init__(self):
        # Text normalisation applied before feature extraction, both at fit
        # and at predict time.
        # NOTE(review): the exact behavior of LowercaseTransformer and
        # PunctuationRemover is defined in cleaning.transformers, not here.
        self.cleaning = Pipeline([
            ("lowercase", LowercaseTransformer()),
            ("punctuation", PunctuationRemover()),
        ])

    def fit(self, X, y):
        """Fit the cleaning steps and the classification pipeline.

        Args:
            X: Iterable of raw messages.
            y: Iterable of labels; entries equal to ``"spam"`` form the
                positive class, every other value is treated as negative.

        Returns:
            This estimator, to allow call chaining.
        """
        X_clean = self.cleaning.fit_transform(X)
        # A fresh pipeline is built on every call, so refitting discards the
        # previously learned vocabulary and weights.
        # NOTE(review): FeatureExtractor presumably applies the given callable
        # to each message -- confirm in classifiers.feature_classifier.
        self._pipeline = Pipeline([
            ("features", FeatureExtractor(self.extract_features)),
            ("vectorizer", DictVectorizer()),
            ("classifier", LogisticRegression(max_iter=1000)),
        ])
        y_binary = (np.array(y) == "spam").astype(int)
        self._pipeline.fit(X_clean, y_binary)
        return self

    def predict(self, X):
        """Predict a ``"spam"``/``"ham"`` label for each message in ``X``.

        Args:
            X: Iterable of raw messages.

        Returns:
            A NumPy array of the strings ``"spam"`` and ``"ham"``.

        Raises:
            AttributeError: If called before ``fit`` (no fitted pipeline).
        """
        X_clean = self.cleaning.transform(X)
        y_binary = self._pipeline.predict(X_clean)
        return np.where(y_binary == 1, "spam", "ham")

    def extract_features(self, message):
        """Return a ``{token: count}`` dict for one whitespace-split message."""
        return dict(Counter(message.split()))

    def feature_weights(self, top_n=10):
        """Return the most influential features of the fitted classifier.

        Args:
            top_n: Total number of features requested. ``top_n // 2`` are
                taken from each end of the weight ranking, so an odd value
                yields ``top_n - 1`` entries.

        Returns:
            A list of ``(feature_name, weight)`` pairs: first the highest
            weights (pushing towards ``"spam"``) in descending order, then
            the lowest weights (pushing towards ``"ham"``) in ascending
            order. Empty when ``top_n < 2``. No feature is listed twice,
            even when fewer than ``top_n`` features exist.

        Raises:
            AttributeError: If called before ``fit``.
        """
        vectorizer = self._pipeline.named_steps["vectorizer"]
        classifier = self._pipeline.named_steps["classifier"]
        names = vectorizer.get_feature_names_out()
        # Binary logistic regression stores a single row of coefficients,
        # expressed for the positive class (label 1, i.e. "spam").
        weights = classifier.coef_[0]
        pairs = sorted(zip(names, weights), key=lambda pair: pair[1])

        half = top_n // 2
        if half <= 0:
            # Guard required: pairs[-0:] is the *whole* list, so without it
            # top_n of 0 or 1 would return every feature instead of none.
            return []

        most_positive = pairs[-half:][::-1]
        # Take the negative end only from what the positive slice left over,
        # so small vocabularies do not report the same feature twice.
        most_negative = pairs[:-half][:half]
        return most_positive + most_negative