diff --git a/classifiers/bag_of_words.py b/classifiers/bag_of_words.py new file mode 100644 index 0000000..c7643c3 --- /dev/null +++ b/classifiers/bag_of_words.py @@ -0,0 +1,46 @@ +from collections import Counter + +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.feature_extraction import DictVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline + +from classifiers.feature_classifier import FeatureExtractor +from cleaning.transformers import LowercaseTransformer, PunctuationRemover + + +class BagOfWordsClassifier(BaseEstimator, ClassifierMixin): + def __init__(self): + self.cleaning = Pipeline([ + ("lowercase", LowercaseTransformer()), + ("punctuation", PunctuationRemover()), + ]) + + def fit(self, X, y): + X_clean = self.cleaning.fit_transform(X) + self._pipeline = Pipeline([ + ("features", FeatureExtractor(self.extract_features)), + ("vectorizer", DictVectorizer()), + ("classifier", LogisticRegression(max_iter=1000)), + ]) + y_binary = (np.array(y) == "spam").astype(int) + self._pipeline.fit(X_clean, y_binary) + return self + + def predict(self, X): + X_clean = self.cleaning.transform(X) + y_binary = self._pipeline.predict(X_clean) + return np.where(y_binary == 1, "spam", "ham") + + def extract_features(self, message): + return dict(Counter(message.split())) + + def feature_weights(self, top_n=10): + vectorizer = self._pipeline.named_steps["vectorizer"] + classifier = self._pipeline.named_steps["classifier"] + names = vectorizer.get_feature_names_out() + weights = classifier.coef_[0] + pairs = sorted(zip(names, weights), key=lambda x: x[1]) + half = top_n // 2 + return pairs[-half:][::-1] + pairs[:half] diff --git a/classifiers/feature_classifier.py b/classifiers/feature_classifier.py index f92cda0..3d4545c 100644 --- a/classifiers/feature_classifier.py +++ b/classifiers/feature_classifier.py @@ -4,8 +4,6 @@ from sklearn.feature_extraction import DictVectorizer from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline -from cleaning.transformers import LowercaseTransformer - class FeatureExtractor(BaseEstimator, TransformerMixin): def __init__(self, extract_fn): @@ -24,10 +22,9 @@ class FeatureClassifier(BaseEstimator, ClassifierMixin): def fit(self, X, y): self._pipeline = Pipeline([ - ("lowercase", LowercaseTransformer()), ("features", FeatureExtractor(self.extract_features)), - ("vec", DictVectorizer()), - ("clf", LogisticRegression(C=self.C, max_iter=1000)), + ("vectorizer", DictVectorizer()), + ("classifier", LogisticRegression(C=self.C, max_iter=1000)), ]) y_binary = (np.array(y) == "spam").astype(int) self._pipeline.fit(X, y_binary) @@ -39,15 +36,15 @@ class FeatureClassifier(BaseEstimator, ClassifierMixin): def extract_features(self, message): return { - "contains_free": int("free" in message), + "contains_free": int("free" in message.lower()), "num_exclamations": message.count("!"), "length": len(message), } def feature_weights(self, top_n=10): - vec = self._pipeline.named_steps["vec"] - clf = self._pipeline.named_steps["clf"] - names = vec.get_feature_names_out() - weights = clf.coef_[0] + vectorizer = self._pipeline.named_steps["vectorizer"] + classifier = self._pipeline.named_steps["classifier"] + names = vectorizer.get_feature_names_out() + weights = classifier.coef_[0] pairs = sorted(zip(names, weights), key=lambda x: abs(x[1]), reverse=True) return pairs[:top_n] diff --git a/classifiers/manual_cleaning.py b/classifiers/manual_cleaning.py deleted file mode 100644 index 0eeedc3..0000000 --- a/classifiers/manual_cleaning.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np -from sklearn.pipeline import Pipeline - -from classifiers.manual import ManualClassifier -from cleaning.transformers import LowercaseTransformer - - -class ManualCleaningClassifier(ManualClassifier): - def __init__(self): - self.cleaning = Pipeline([ - ("lowercase", LowercaseTransformer()), - ]) - - def fit(self, X, y): - self.cleaning.fit(X) - return self - - def predict(self, X): - X_clean = self.cleaning.transform(X) - return np.array([self.predict_one(msg) for msg in X_clean]) diff --git a/questions.md b/questions.md index 5847b63..c9a143b 100644 --- a/questions.md +++ b/questions.md @@ -44,25 +44,9 @@ --- -## Checkpoint 3: Data Cleaning +## Checkpoint 3: Designing Features by Hand -**8. Which transformers did you add to the pipeline, and in what order?** - -*Your answer:* - -**9. Did any transformer hurt performance? Why might cleaning sometimes make things worse?** - -*Your answer:* - -**10. If you removed the lowercasing from your `predict_one` method now that the pipeline handles it, would your results change?** - -*Your answer:* - ---- - -## Checkpoint 4: Feature Engineering - -**11. List all the features you implemented and the reasoning behind each:** +**8. List all the features you implemented and the reasoning behind each:** | Feature name | What it measures | Reasoning | |-------------|-----------------|-----------| @@ -70,7 +54,7 @@ | | | | | | | | -**12. Record your best results:** +**9. Record your best results:** | Metric | Value | |--------|-------| @@ -78,11 +62,35 @@ | Spam recall | | | Spam F1 | | -**13. Which features received the largest positive weights (most predictive of spam)?** +**10. Which features received the largest positive weights (most predictive of spam)? The largest negative weights (predictive of ham)? Does this match your expectations?** *Your answer:* -**14. Which features received near-zero weights? Why might the model have ignored them?** +**11. Did any feature you thought would help receive a near-zero weight? Why might the model have decided it was unimportant?** + +*Your answer:* + +--- + +## Checkpoint 4: Bag of Words + +**12. Which transformers did you include in your cleaning pipeline, and in what order? Explain how each one changes the vocabulary.** + +*Your answer:* + +**13. Record your best results:** + +| Metric | Value | +|--------|-------| +| Spam precision | | +| Spam recall | | +| Spam F1 | | + +**14. How did the bag-of-words classifier's performance compare to your best hand-designed-features classifier? What do you think accounts for the difference?** + +*Your answer:* + +**15. Look at the words with the strongest weights (in either direction). Do any surprise you? What do they suggest about how the model is making its decisions?** *Your answer:* @@ -90,7 +98,7 @@ ## Final Questions -**15. Pick a different classification problem (for example: positive vs. negative movie reviews, +**16. Pick a different classification problem (for example: positive vs. negative movie reviews, news articles vs. opinion pieces, or medical vs. general-audience text). Propose five features you would extract to classify it, and explain your reasoning.** @@ -104,17 +112,11 @@ Problem I chose: | | | | | | | | -**16. Could adding more features ever *hurt* the performance of a classifier? Explain +**17. Could adding more features ever *hurt* the performance of a classifier? Explain when and why this might happen.** *Your answer:* -**17. It would be possible to make every word in the vocabulary a feature, where each -feature indicates whether that word appears in the message. Explain how this could -be implemented. Do you think it would work well? What are the trade-offs?** - -*Your answer:* - **18. In this lab you split the data into 70% training and 30% testing. What would happen if you used 99% for training and 1% for testing? What about 1% for training and 99% for testing?**