Files
lab_classification_features/cli/main.py
Chris Proctor bbe8054910 Simplify cleaning transformers and shorten module names
Move cleaning transformers into classifiers/cleaning.py (dropping the
separate cleaning package) and implement them as plain classes rather
than BaseEstimator/TransformerMixin subclasses, since Pipeline only
needs fit/transform via duck typing. Also rename feature_classifier.py
and bag_of_words.py to features.py and bow.py for brevity.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 10:02:41 -04:00

90 lines
2.2 KiB
Python

"""Evaluate a spam classifier.
Usage:
    spam -e
    spam classifiers.manual.ManualClassifier
    spam classifiers.features.FeatureClassifier
    spam classifiers.manual.ManualClassifier -t 0.2
    spam classifiers.manual.ManualClassifier -a
    spam classifiers.manual.ManualClassifier -a 5
"""
import argparse
import importlib
from sklearn.model_selection import train_test_split
import cli.output as out
from cli.data import load_spam
def load_classifier(class_path):
    """Import and instantiate the class named by a dotted path.

    Args:
        class_path: Fully-qualified class name such as
            ``classifiers.manual.ManualClassifier``. Everything before the
            last dot is imported as a module; the remainder is looked up as
            an attribute of that module.

    Returns:
        A new instance of the named class, constructed with no arguments.

    Raises:
        ValueError: If ``class_path`` has no module part (it contains no
            dot, or nothing precedes the last dot).
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute with the given name.
    """
    # rpartition never fails to unpack, so a path without a dot can be
    # reported with a message that says what was expected instead of the
    # opaque "not enough values to unpack" from tuple unpacking.
    module_path, _, class_name = class_path.rpartition(".")
    if not module_path:
        raise ValueError(
            f"expected a fully-qualified class path like "
            f"'package.module.ClassName', got {class_path!r}"
        )
    module = importlib.import_module(module_path)
    return getattr(module, class_name)()
def _parse_test_ratio(text):
    """Convert the ``--test-ratio`` value to a float strictly between 0 and 1.

    Used as an argparse ``type=`` callable so that an unusable ratio is
    reported as a usage error at parse time rather than as a traceback from
    ``train_test_split`` after the dataset has already been loaded.

    Raises:
        argparse.ArgumentTypeError: If ``text`` is not a number or is not
            inside the open interval (0, 1).
    """
    try:
        ratio = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid float value: {text!r}"
        ) from None
    # The chained comparison is also False for NaN, which is rejected too.
    if not 0.0 < ratio < 1.0:
        raise argparse.ArgumentTypeError(
            f"test ratio must be strictly between 0 and 1, got {text!r}"
        )
    return ratio


def _build_parser():
    """Return the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Train and evaluate a spam classifier."
    )
    parser.add_argument(
        "classifier",
        nargs="?",
        help="Fully-qualified class, e.g. classifiers.manual.ManualClassifier",
    )
    parser.add_argument(
        "-e", "--explore",
        action="store_true",
        help="Drop into an interactive shell with the dataset loaded as `df`",
    )
    parser.add_argument(
        "-t", "--test-ratio",
        type=_parse_test_ratio,
        default=0.3,
        help="Fraction held out for testing (default: 0.3)",
    )
    # nargs="?" with const=10 gives three states: flag absent (None),
    # bare "-a" (10), and "-a N" (N).
    parser.add_argument(
        "-a", "--error-analysis",
        type=int,
        nargs="?",
        const=10,
        default=None,
        metavar="N",
        help="Show up to N misclassified examples (default: 10)",
    )
    return parser


def main(argv=None):
    """Run the command-line interface.

    With neither a classifier nor ``--explore``, prints the help text and
    returns. With ``--explore``, passes the loaded dataset to
    ``out.explore``. With a classifier path, splits the dataset into train
    and test portions, fits the classifier on the training portion, and
    reports its evaluation, its feature weights, and (when ``-a`` was
    given) an error analysis of the test predictions.

    Args:
        argv: Argument strings to parse. Defaults to ``None``, in which
            case argparse reads ``sys.argv[1:]``, so calling ``main()``
            with no arguments behaves as before.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if not (args.classifier or args.explore):
        parser.print_help()
        return

    df = load_spam()

    if args.explore:
        out.explore(df)

    if not args.classifier:
        return

    X = df["message"].values
    y = df["label"].values
    # A fixed random_state keeps the split identical from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_ratio, random_state=42
    )
    out.dataset_summary(df, len(X_train), len(X_test))

    clf = load_classifier(args.classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    out.evaluation(y_test, y_pred, type(clf).__name__)
    out.feature_weights(clf)

    if args.error_analysis is not None:
        out.error_analysis(X_test, y_test, y_pred, args.error_analysis)
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. from a console-script entry point) has no side effects.
if __name__ == "__main__":
    main()