diff --git a/classifiers/bag_of_words.py b/classifiers/bag_of_words.py index c7643c3..b6c687d 100644 --- a/classifiers/bag_of_words.py +++ b/classifiers/bag_of_words.py @@ -1,41 +1,41 @@ from collections import Counter import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.feature_extraction import DictVectorizer from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline -from classifiers.feature_classifier import FeatureExtractor from cleaning.transformers import LowercaseTransformer, PunctuationRemover -class BagOfWordsClassifier(BaseEstimator, ClassifierMixin): - def __init__(self): - self.cleaning = Pipeline([ +class FeatureExtractor: + def fit(self, X, y=None): + return self + + def transform(self, X): + return [self.extract_features(msg) for msg in X] + + def extract_features(self, message): + return dict(Counter(message.split())) + + +class BagOfWordsClassifier: + def fit(self, X, y): + self._pipeline = Pipeline([ ("lowercase", LowercaseTransformer()), ("punctuation", PunctuationRemover()), - ]) - - def fit(self, X, y): - X_clean = self.cleaning.fit_transform(X) - self._pipeline = Pipeline([ - ("features", FeatureExtractor(self.extract_features)), + ("features", FeatureExtractor()), ("vectorizer", DictVectorizer()), ("classifier", LogisticRegression(max_iter=1000)), ]) y_binary = (np.array(y) == "spam").astype(int) - self._pipeline.fit(X_clean, y_binary) + self._pipeline.fit(X, y_binary) return self def predict(self, X): - X_clean = self.cleaning.transform(X) - y_binary = self._pipeline.predict(X_clean) + y_binary = self._pipeline.predict(X) return np.where(y_binary == 1, "spam", "ham") - def extract_features(self, message): - return dict(Counter(message.split())) - def feature_weights(self, top_n=10): vectorizer = self._pipeline.named_steps["vectorizer"] classifier = self._pipeline.named_steps["classifier"] diff --git a/classifiers/feature_classifier.py b/classifiers/feature_classifier.py index 3d4545c..dcd1f46 100644 --- a/classifiers/feature_classifier.py +++ b/classifiers/feature_classifier.py @@ -1,30 +1,30 @@ import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin from sklearn.feature_extraction import DictVectorizer from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline -class FeatureExtractor(BaseEstimator, TransformerMixin): - def __init__(self, extract_fn): - self.extract_fn = extract_fn - +class FeatureExtractor: def fit(self, X, y=None): return self def transform(self, X): - return [self.extract_fn(msg) for msg in X] + return [self.extract_features(msg) for msg in X] + + def extract_features(self, message): + return { + "contains_free": int("free" in message.lower()), + "num_exclamations": message.count("!"), + "length": len(message), + } -class FeatureClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, C=1.0): - self.C = C - +class FeatureClassifier: def fit(self, X, y): self._pipeline = Pipeline([ - ("features", FeatureExtractor(self.extract_features)), + ("features", FeatureExtractor()), ("vectorizer", DictVectorizer()), - ("classifier", LogisticRegression(C=self.C, max_iter=1000)), + ("classifier", LogisticRegression(max_iter=1000)), ]) y_binary = (np.array(y) == "spam").astype(int) self._pipeline.fit(X, y_binary) @@ -34,13 +34,6 @@ class FeatureClassifier(BaseEstimator, ClassifierMixin): y_binary = self._pipeline.predict(X) return np.where(y_binary == 1, "spam", "ham") - def extract_features(self, message): - return { - "contains_free": int("free" in message.lower()), - "num_exclamations": message.count("!"), - "length": len(message), - } - def feature_weights(self, top_n=10): vectorizer = self._pipeline.named_steps["vectorizer"] classifier = self._pipeline.named_steps["classifier"] diff --git a/classifiers/manual.py b/classifiers/manual.py index 357175d..7106c93 100644 --- a/classifiers/manual.py +++ b/classifiers/manual.py @@ -1,8 +1,7 @@ import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin -class ManualClassifier(BaseEstimator, ClassifierMixin): +class ManualClassifier: def fit(self, X, y): return self