# lab_classification_features/cli/main.py

"""Evaluate a spam classifier.

Usage:
    spam -e
    spam models.manual.ManualClassifier
    spam models.features.FeatureClassifier
    spam models.manual.ManualClassifier -t 0.2
    spam models.manual.ManualClassifier -a
    spam models.manual.ManualClassifier -a 5
    spam models.manual.ManualClassifier -f 20
"""

import argparse
import importlib

from sklearn.model_selection import train_test_split

import cli.output as out
from cli.data import load_spam


def load_classifier(class_path):
    """Import and instantiate the class named by a dotted path.

    Args:
        class_path: Fully-qualified name such as
            ``models.manual.ManualClassifier``; everything before the last
            dot is imported as a module and the remainder is looked up on it.

    Returns:
        A new instance of the named class, constructed with no arguments.

    Raises:
        ValueError: If ``class_path`` contains no dot or has an empty module
            part, so it cannot be split into module and class.
        ImportError: If the module part cannot be imported.
        AttributeError: If the module has no attribute with the class name.
    """
    module_path, dot, class_name = class_path.rpartition(".")
    # A bare name (no dot) or a leading dot leaves nothing to import; fail
    # with a message that shows the expected format instead of an opaque
    # unpacking / "Empty module name" error.
    if not dot or not module_path:
        raise ValueError(
            "Expected a fully-qualified class path like "
            f"'models.manual.ManualClassifier', got {class_path!r}"
        )
    module = importlib.import_module(module_path)
    return getattr(module, class_name)()


def main(argv=None):
    """Parse command-line options, then train and evaluate a classifier.

    With neither a classifier nor ``--explore`` the help text is printed and
    nothing else happens. With ``--explore`` the dataset is handed to
    ``out.explore``; if no classifier was given the function stops there.
    Otherwise the dataset is split, the classifier is fitted on the training
    part, and evaluation, feature weights and (optionally) error analysis are
    reported through the ``cli.output`` helpers.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            ``--test-ratio`` outside the open interval (0, 1) when a
            classifier run was requested.
    """
    parser = argparse.ArgumentParser(
        description="Train and evaluate a spam classifier."
    )
    parser.add_argument(
        "classifier",
        nargs="?",
        help="Fully-qualified class, e.g. models.manual.ManualClassifier",
    )
    parser.add_argument(
        "-e", "--explore",
        action="store_true",
        help="Drop into an interactive shell with the dataset loaded as `df`",
    )
    parser.add_argument(
        "-t", "--test-ratio",
        type=float,
        default=0.3,
        help="Fraction held out for testing (default: 0.3)",
    )
    parser.add_argument(
        "-a", "--error-analysis",
        type=int,
        nargs="?",
        # `-a` with no value means 10; omitting `-a` leaves None (disabled).
        const=10,
        default=None,
        metavar="N",
        help="Show up to N misclassified examples (default: 10)",
    )
    parser.add_argument(
        "-f", "--top-features",
        type=int,
        default=10,
        metavar="N",
        help="Show the top N features by weight (default: 10)",
    )
    args = parser.parse_args(argv)

    if not args.classifier and not args.explore:
        parser.print_help()
        return

    # The ratio is only used for a classifier run. Reject an unusable value
    # up front with a normal usage error rather than letting the split fail
    # with a traceback after the dataset has already been loaded.
    if args.classifier and not 0.0 < args.test_ratio < 1.0:
        parser.error(
            "argument -t/--test-ratio: must be strictly between 0 and 1, "
            f"got {args.test_ratio}"
        )

    df = load_spam()

    if args.explore:
        out.explore(df)
        if not args.classifier:
            return

    X = df["message"].values
    y = df["label"].values
    # Fixed random_state keeps the split identical from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_ratio, random_state=42
    )

    out.dataset_summary(df, len(X_train), len(X_test))

    clf = load_classifier(args.classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    out.evaluation(y_test, y_pred, type(clf).__name__)
    out.feature_weights(clf, top_n=args.top_features)

    if args.error_analysis is not None:
        out.error_analysis(X_test, y_test, y_pred, args.error_analysis)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()