Simplify cleaning transformers and shorten module names
Move cleaning transformers into classifiers/cleaning.py (dropping the separate cleaning package) and implement them as plain classes rather than BaseEstimator/TransformerMixin subclasses, since Pipeline only needs fit/transform via duck typing. Also rename feature_classifier.py and bag_of_words.py to features.py and bow.py, respectively, for brevity.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from sklearn.feature_extraction import DictVectorizer
|
|||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
from cleaning.transformers import LowercaseTransformer, PunctuationRemover
|
from classifiers.cleaning import LowercaseTransformer, PunctuationRemover
|
||||||
|
|
||||||
|
|
||||||
class FeatureExtractor:
|
class FeatureExtractor:
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
|
||||||
|
|
||||||
|
|
||||||
STOPWORDS = {
|
STOPWORDS = {
|
||||||
@@ -14,18 +13,16 @@ STOPWORDS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class LowercaseTransformer(BaseEstimator, TransformerMixin):
|
class LowercaseTransformer:
|
||||||
def fit(self, X, y=None):
|
def fit(self, X, y=None):
|
||||||
self.fitted_ = True
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
return np.array([msg.lower() for msg in X])
|
return np.array([msg.lower() for msg in X])
|
||||||
|
|
||||||
|
|
||||||
class StopwordRemover(BaseEstimator, TransformerMixin):
|
class StopwordRemover:
|
||||||
def fit(self, X, y=None):
|
def fit(self, X, y=None):
|
||||||
self.fitted_ = True
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
@@ -36,9 +33,8 @@ class StopwordRemover(BaseEstimator, TransformerMixin):
|
|||||||
return " ".join(w for w in words if w.lower() not in STOPWORDS)
|
return " ".join(w for w in words if w.lower() not in STOPWORDS)
|
||||||
|
|
||||||
|
|
||||||
class PunctuationRemover(BaseEstimator, TransformerMixin):
|
class PunctuationRemover:
|
||||||
def fit(self, X, y=None):
|
def fit(self, X, y=None):
|
||||||
self.fitted_ = True
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
spam -e
|
spam -e
|
||||||
spam classifiers.manual.ManualClassifier
|
spam classifiers.manual.ManualClassifier
|
||||||
spam classifiers.feature_classifier.FeatureClassifier
|
spam classifiers.features.FeatureClassifier
|
||||||
spam classifiers.manual.ManualClassifier -t 0.2
|
spam classifiers.manual.ManualClassifier -t 0.2
|
||||||
spam classifiers.manual.ManualClassifier -a
|
spam classifiers.manual.ManualClassifier -a
|
||||||
spam classifiers.manual.ManualClassifier -a 5
|
spam classifiers.manual.ManualClassifier -a 5
|
||||||
|
|||||||
@@ -16,4 +16,4 @@ requires = ["hatchling"]
|
|||||||
build-backend = "hatchling.build"
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
[tool.hatch.build.targets.wheel]
|
||||||
packages = ["cli", "classifiers", "cleaning"]
|
packages = ["cli", "classifiers"]
|
||||||
|
|||||||
Reference in New Issue
Block a user