Files
lab_classification_features/cli/main.py
Chris Proctor 61eb5a150c Rename classifiers/ package to models/
Aligns module naming with the upcoming classification_neural lab,
which will use the same models/ package convention.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 10:57:25 -04:00

98 lines
2.4 KiB
Python

"""Evaluate a spam classifier.

Usage:
    spam -e
    spam models.manual.ManualClassifier
    spam models.features.FeatureClassifier
    spam models.manual.ManualClassifier -t 0.2
    spam models.manual.ManualClassifier -a
    spam models.manual.ManualClassifier -a 5
    spam models.manual.ManualClassifier -f 20
"""
import argparse
import importlib
from sklearn.model_selection import train_test_split
import cli.output as out
from cli.data import load_spam
def load_classifier(class_path):
    """Instantiate the classifier named by a dotted import path.

    Args:
        class_path: Fully-qualified class name of the form
            ``package.module.ClassName``, e.g.
            ``models.manual.ManualClassifier``.

    Returns:
        A new instance of the named class, constructed with no arguments.

    Raises:
        ValueError: If ``class_path`` has no ``.`` separating the module
            from the class name, or if the module part is empty.
        ImportError: If the module part cannot be imported.
        AttributeError: If the imported module has no attribute with the
            given class name.
    """
    # Split on the LAST dot: everything before it is the module to import,
    # everything after it is the attribute to look up on that module.
    module_path, dot, class_name = class_path.rpartition(".")
    if not dot:
        # Without this check the failure would surface as an opaque
        # tuple-unpacking error; tell the user what form is expected.
        raise ValueError(
            "Expected a fully-qualified class such as "
            f"'models.manual.ManualClassifier', got {class_path!r}"
        )
    module = importlib.import_module(module_path)
    classifier_cls = getattr(module, class_name)
    return classifier_cls()
def _parse_test_ratio(value):
    """Convert a ``-t/--test-ratio`` argument to a float in (0, 1).

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a number or lies
            outside the open interval (0, 1), so argparse reports a usage
            error instead of the split failing later with a traceback.
    """
    try:
        ratio = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid float value: {value!r}"
        ) from None
    # A chained comparison also rejects NaN, which compares False to everything.
    if not 0.0 < ratio < 1.0:
        raise argparse.ArgumentTypeError(
            f"test ratio must be strictly between 0 and 1, got {value!r}"
        )
    return ratio


def _build_parser():
    """Build the argument parser for the spam classifier command line."""
    parser = argparse.ArgumentParser(
        description="Train and evaluate a spam classifier."
    )
    parser.add_argument(
        "classifier",
        nargs="?",
        help="Fully-qualified class, e.g. models.manual.ManualClassifier",
    )
    parser.add_argument(
        "-e", "--explore",
        action="store_true",
        help="Drop into an interactive shell with the dataset loaded as `df`",
    )
    parser.add_argument(
        "-t", "--test-ratio",
        type=_parse_test_ratio,
        default=0.3,
        help="Fraction held out for testing (default: 0.3)",
    )
    # nargs="?" with const=10: a bare "-a" means 10 examples, "-a N" means N,
    # and omitting the flag leaves None so error analysis is skipped.
    parser.add_argument(
        "-a", "--error-analysis",
        type=int,
        nargs="?",
        const=10,
        default=None,
        metavar="N",
        help="Show up to N misclassified examples (default: 10)",
    )
    parser.add_argument(
        "-f", "--top-features",
        type=int,
        default=10,
        metavar="N",
        help="Show the top N features by weight (default: 10)",
    )
    return parser


def main(argv=None):
    """Run the spam classifier command line.

    With neither a classifier nor ``--explore`` given, prints the help text
    and returns. Otherwise loads the dataset, optionally passes it to
    ``out.explore``, and, when a classifier is named, trains it on a
    train/test split and reports the results through ``cli.output``.

    Args:
        argv: Argument strings to parse. Defaults to ``None``, in which
            case argparse reads ``sys.argv[1:]``.

    Returns:
        None.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Nothing to do: show usage rather than loading the dataset for no reason.
    if not args.classifier and not args.explore:
        parser.print_help()
        return

    df = load_spam()

    if args.explore:
        out.explore(df)

    # --explore on its own stops here; combined with a classifier it
    # continues on to the evaluation below.
    if not args.classifier:
        return

    X = df["message"].values
    y = df["label"].values
    # A fixed random_state gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_ratio, random_state=42
    )
    out.dataset_summary(df, len(X_train), len(X_test))

    clf = load_classifier(args.classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    out.evaluation(y_test, y_pred, type(clf).__name__)
    out.feature_weights(clf, top_n=args.top_features)

    if args.error_analysis is not None:
        out.error_analysis(X_test, y_test, y_pred, args.error_analysis)
# Run the command line only when this file is executed as a script;
# importing the module must not trigger argument parsing.
if __name__ == "__main__":
    main()