from collections import Counter

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from classifiers.cleaning import LowercaseTransformer, PunctuationRemover


class FeatureExtractor:
    """Pipeline step that turns each message into a token-count dict.

    Exposes the ``fit``/``transform`` pair that ``Pipeline`` calls on its
    intermediate steps; the step itself is stateless.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a list with one ``{token: count}`` dict per message in X."""
        return [self.extract_features(msg) for msg in X]

    def extract_features(self, message):
        """Count whitespace-separated tokens in ``message``.

        Returns a plain ``dict`` mapping each token to its number of
        occurrences, which is the input format ``DictVectorizer`` consumes.
        """
        return dict(Counter(message.split()))


class BagOfWordsClassifier:
    """Spam/ham classifier: bag-of-words counts fed to logistic regression."""

    def fit(self, X, y):
        """Build the pipeline and train it on messages X with labels y.

        Labels equal to ``"spam"`` are encoded as 1; every other label is
        encoded as 0. Returns ``self`` so calls can be chained.
        """
        # NOTE(review): LowercaseTransformer and PunctuationRemover come from
        # classifiers.cleaning; presumably they normalise the raw text as
        # their names suggest -- confirm against that module.
        self._pipeline = Pipeline([
            ("lowercase", LowercaseTransformer()),
            ("punctuation", PunctuationRemover()),
            ("features", FeatureExtractor()),
            ("vectorizer", DictVectorizer()),
            ("classifier", LogisticRegression(max_iter=1000)),
        ])
        y_binary = (np.array(y) == "spam").astype(int)
        self._pipeline.fit(X, y_binary)
        return self

    def predict(self, X):
        """Return an array of ``"spam"``/``"ham"`` labels for messages X.

        Must be called after ``fit``, which creates the pipeline.
        """
        y_binary = self._pipeline.predict(X)
        return np.where(y_binary == 1, "spam", "ham")

    def feature_weights(self, top_n=10):
        """Return the most strongly weighted features of the fitted model.

        The result is a list of ``(feature_name, weight)`` pairs: first the
        ``top_n // 2`` highest-weighted features in descending order, then
        the ``top_n // 2`` lowest-weighted features in ascending order.
        Weights are the classifier's first coefficient row.

        If ``top_n // 2`` is zero or negative the result is empty. If the
        vocabulary holds fewer than ``2 * (top_n // 2)`` features, each
        feature is reported at most once (the high-weight group is filled
        first).
        """
        vectorizer = self._pipeline.named_steps["vectorizer"]
        classifier = self._pipeline.named_steps["classifier"]
        names = vectorizer.get_feature_names_out()
        weights = classifier.coef_[0]
        ranked = sorted(zip(names, weights), key=lambda pair: pair[1])

        # Clamp to [0, len(ranked)]. Without the lower bound, a half of 0
        # would make ranked[-0:] select the *entire* list.
        half = min(max(top_n // 2, 0), len(ranked))
        if half == 0:
            return []

        highest = ranked[-half:][::-1]
        # Only take low-weight features not already reported in `highest`,
        # so a small vocabulary does not produce duplicate entries.
        remaining = len(ranked) - half
        lowest = ranked[:min(half, remaining)]
        return highest + lowest