diff --git a/.gitignore b/.gitignore index 2cceedb..2ec3657 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ **/__pycache__/* +weights/ diff --git a/cli/main.py b/cli/main.py index 39fcffe..b3f6c9f 100644 --- a/cli/main.py +++ b/cli/main.py @@ -2,20 +2,26 @@ Usage: digits -e - digits models.handpicked.HandPickedClassifier + digits -e 10 + digits models.features.FeatureClassifier digits models.pixels.PixelClassifier digits models.mlp.MLPClassifier digits models.mlp.MLPClassifier --hidden 64 64 digits models.mlp.MLPClassifier -a digits models.cnn.CNNClassifier --epochs 3 digits models.cnn.CNNClassifier -a 5 + digits models.cnn.CNNClassifier --save weights/cnn + digits weights/cnn + digits weights/cnn --run """ import argparse import importlib import cli.output as out +import cli.webcam as webcam from cli.data import load_mnist +from cli.persistence import is_saved_model, load_model, save_model def load_classifier(class_path, **kwargs): @@ -33,12 +39,17 @@ def main(): parser.add_argument( "classifier", nargs="?", - help="Fully-qualified class, e.g. models.mlp.MLPClassifier", + help="Fully-qualified class (e.g. models.mlp.MLPClassifier), " + "or the path to a model saved with --save", ) parser.add_argument( "-e", "--explore", - action="store_true", - help="Show sample digits and the label distribution", + type=int, + nargs="?", + const=3, + default=None, + metavar="N", + help="Show N sample digits and the label distribution (default: 3)", ) parser.add_argument( "-a", "--error-analysis", @@ -64,27 +75,45 @@ def main(): metavar="N", help="Number of training epochs (MLPClassifier and CNNClassifier only)", ) + parser.add_argument( + "--save", + metavar="DIR", + help="After training, save the model's configuration and weights to DIR", + ) + parser.add_argument( + "--run", + action="store_true", + help="Open the webcam and classify handwritten digits live", + ) args = parser.parse_args() - if not args.classifier and not args.explore: + if not args.classifier and args.explore is None: parser.print_help() return X_train, X_test, y_train, y_test = load_mnist() - if args.explore: - out.explore(X_train, y_train) + if args.explore is not None: + out.explore(X_train, y_train, args.explore) if not args.classifier: return out.dataset_summary(len(X_train), len(X_test)) - clf = load_classifier( - args.classifier, - hidden_sizes=tuple(args.hidden) if args.hidden else None, - epochs=args.epochs, - ) - clf.fit(X_train, y_train) + if is_saved_model(args.classifier): + clf = load_model(args.classifier) + print(f"Loaded saved model from {args.classifier}\n") + else: + clf = load_classifier( + args.classifier, + hidden_sizes=tuple(args.hidden) if args.hidden else None, + epochs=args.epochs, + ) + clf.fit(X_train, y_train) + if args.save: + save_model(clf, args.save) + print(f"Saved model to {args.save}\n") + y_pred = clf.predict(X_test) out.evaluation(y_test, y_pred, type(clf).__name__) @@ -92,6 +121,9 @@ def main(): if args.error_analysis is not None: out.error_analysis(X_test, y_test, y_pred, args.error_analysis) + if args.run: + webcam.run(clf) + if __name__ == "__main__": main() diff --git a/cli/output.py b/cli/output.py index 2f56cc7..717ac7a 100644 --- a/cli/output.py +++ b/cli/output.py @@ -8,11 +8,11 @@ def show_digit(pixels): print(" ".join("#" if p > 0.5 else "." for p in row)) -def explore(X_train, y_train): +def explore(X_train, y_train, n=3): print("=" * 60) print("SAMPLE DIGITS") print("=" * 60) - for i in range(3): + for i in range(n): print(f"\nLabel: {y_train[i]}") show_digit(X_train[i]) diff --git a/cli/persistence.py b/cli/persistence.py new file mode 100644 index 0000000..3d9c5b9 --- /dev/null +++ b/cli/persistence.py @@ -0,0 +1,18 @@ +import os + +import joblib + +MODEL_FILE = "model.joblib" + + +def is_saved_model(path): + return os.path.isdir(path) and os.path.exists(os.path.join(path, MODEL_FILE)) + + +def save_model(clf, directory): + os.makedirs(directory, exist_ok=True) + joblib.dump(clf, os.path.join(directory, MODEL_FILE)) + + +def load_model(directory): + return joblib.load(os.path.join(directory, MODEL_FILE)) diff --git a/cli/webcam.py b/cli/webcam.py new file mode 100644 index 0000000..301484b --- /dev/null +++ b/cli/webcam.py @@ -0,0 +1,66 @@ +import cv2 +import numpy as np + +WINDOW_TITLE = "Hold up a digit -- press q to quit" + + +def preprocess(region): + """Turn a captured square region into a 784-value array like MNIST's. + + MNIST digits are white strokes on a black background, so after + converting to grayscale and shrinking to 28x28, we invert the + brightness (ink-on-paper is normally dark-on-light) and scale + pixel values down to the [0, 1] range `load_mnist` uses. + """ + gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY) + small = cv2.resize(gray, (28, 28)) + inverted = 255 - small + return (inverted.astype(np.float32) / 255.0).flatten() + + +def central_square(frame): + height, width = frame.shape[:2] + size = min(height, width) + top = (height - size) // 2 + left = (width - size) // 2 + return top, left, size + + +def run(clf): + capture = cv2.VideoCapture(0) + if not capture.isOpened(): + print("Could not open the webcam.") + return + + print("Hold a handwritten digit up to the camera, inside the box.") + print("Press 'q' (with the video window focused) to quit.\n") + + try: + while True: + found, frame = capture.read() + if not found: + break + frame = cv2.flip(frame, 1) + + top, left, size = central_square(frame) + region = frame[top:top + size, left:left + size] + pixels = preprocess(region) + + probabilities = clf.predict_proba([pixels])[0] + digit = probabilities.argmax() + confidence = probabilities[digit] + print(f"\rpredicted digit: {digit} confidence: {confidence:.2f} ", end="", flush=True) + + label = f"{digit} ({confidence:.0%})" + cv2.rectangle(frame, (left, top), (left + size, top + size), (0, 200, 0), 2) + cv2.putText(frame, label, (left, top - 12), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 200, 0), 2) + cv2.imshow(WINDOW_TITLE, frame) + + if cv2.waitKey(1) & 0xFF == ord("q"): + break + except KeyboardInterrupt: + pass + finally: + capture.release() + cv2.destroyAllWindows() + print() diff --git a/models/cnn.py b/models/cnn.py index e4e048e..db8f75a 100644 --- a/models/cnn.py +++ b/models/cnn.py @@ -72,9 +72,13 @@ class CNNClassifier: self._model = model return self - def predict(self, X): + def predict_proba(self, X): X_te = torch.tensor(X, dtype=torch.float32) self._model.eval() with torch.no_grad(): - predictions = self._model(X_te.to(self._device)).argmax(dim=1).cpu().numpy() - return predictions + logits = self._model(X_te.to(self._device)) + probabilities = torch.softmax(logits, dim=1).cpu().numpy() + return probabilities + + def predict(self, X): + return self.predict_proba(X).argmax(axis=1) diff --git a/models/handpicked.py b/models/features.py similarity index 93% rename from models/handpicked.py rename to models/features.py index bee010c..ebef8c8 100644 --- a/models/handpicked.py +++ b/models/features.py @@ -34,7 +34,7 @@ class FeatureExtractor: } -class HandPickedClassifier: +class FeatureClassifier: def fit(self, X, y): self._pipeline = Pipeline([ ("features", FeatureExtractor()), @@ -46,3 +46,6 @@ class HandPickedClassifier: def predict(self, X): return self._pipeline.predict(X) + + def predict_proba(self, X): + return self._pipeline.predict_proba(X) diff --git a/models/mlp.py b/models/mlp.py index 7254d19..2bbfc36 100644 --- a/models/mlp.py +++ b/models/mlp.py @@ -66,9 +66,13 @@ class MLPClassifier: self._model = model return self - def predict(self, X): + def predict_proba(self, X): X_te = torch.tensor(X, dtype=torch.float32) self._model.eval() with torch.no_grad(): - predictions = self._model(X_te.to(self._device)).argmax(dim=1).cpu().numpy() - return predictions + logits = self._model(X_te.to(self._device)) + probabilities = torch.softmax(logits, dim=1).cpu().numpy() + return probabilities + + def predict(self, X): + return self.predict_proba(X).argmax(axis=1) diff --git a/models/pixels.py b/models/pixels.py index e9bf018..0abb75f 100644 --- a/models/pixels.py +++ b/models/pixels.py @@ -9,3 +9,6 @@ class PixelClassifier: def predict(self, X): return self._classifier.predict(X) + + def predict_proba(self, X): + return self._classifier.predict_proba(X) diff --git a/pyproject.toml b/pyproject.toml index 0537a1f..a1d1e49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,9 @@ version = "0.1.0" description = "Classification: Neural Networks lab" requires-python = ">=3.11" dependencies = [ + "joblib>=1.3", "numpy>=1.24", + "opencv-python>=4.13.0.92", "pandas>=2.0", "scikit-learn>=1.3", "torch>=2.0", diff --git a/questions.md b/questions.md index 962da14..76388a8 100644 --- a/questions.md +++ b/questions.md @@ -22,7 +22,7 @@ *Your answer:* -**4. Which digits were classified correctly most often? Which were most often confused for each other? (Use `digits models.handpicked.HandPickedClassifier -a` to see misclassified examples.)** +**4. Which digits were classified correctly most often? Which were most often confused for each other? (Use `digits models.features.FeatureClassifier -a` to see misclassified examples.)** *Your answer:* @@ -51,9 +51,29 @@ --- -## Checkpoint 3: Multi-Layer Perceptron +## Checkpoint 3: Single Hidden Layer -**9. Sketch your MLP architecture here (fill in the layer sizes you used):** +**9. Record your results for at least three combinations of hidden layer size and number of epochs:** + +| Hidden size | Epochs | Test accuracy | +|------------|--------|--------------| +| | | | +| | | | +| | | | + +**10. What happened with a very small hidden layer (8 or 16 neurons)? With a very large one (512)? What does that suggest about what the hidden layer is doing?** + +*Your answer:* + +**11. Each hidden neuron is a learned feature. How does the number of features available to this network compare to the one or two you designed by hand in Checkpoint 1—and how does that help explain the difference in accuracy?** + +*Your answer:* + +--- + +## Checkpoint 4: Multi-Layer Perceptron + +**12. Sketch your MLP architecture here (fill in the layer sizes you used):** ``` Input layer: _____ neurons (one per pixel) @@ -62,18 +82,18 @@ Hidden layer 2: _____ neurons, ReLU activation [if you used one] Output layer: _____ neurons (one per digit) ``` -**10. Record your best results:** +**13. Record your best results:** | Hidden sizes | Epochs | Val accuracy (final) | Test accuracy | F1 (macro) | |-------------|--------|---------------------|--------------|------------| | | | | | | | | | | | | -**11. Both the MLP and the pixel classifier see the same 784 numbers. What does the MLP do with them that the pixel classifier cannot?** +**14. Both the MLP and the pixel classifier see the same 784 numbers. What does the MLP do with them that the pixel classifier cannot?** *Your answer:* -**12. The MLP still flattens the image into a vector of 784 numbers before its first layer ever sees it—it has no idea that pixel 0 and pixel 28 are vertical neighbors. Did stacking layers fix the limitation you identified in Checkpoint 2, or just hide it better?** +**15. The MLP still flattens the image into a vector of 784 numbers before its first layer ever sees it—it has no idea that pixel 0 and pixel 28 are vertical neighbors. Did stacking layers fix the limitation you identified in Checkpoint 2, or just hide it better?** *Your answer:* @@ -81,7 +101,7 @@ Output layer: _____ neurons (one per digit) ## Final Questions -**13. Sketch the CNN architecture (label each layer with its type and dimensions):** +**16. Sketch the CNN architecture (label each layer with its type and dimensions):** ``` Input: ___x___x___ (height × width × channels) @@ -101,13 +121,13 @@ Fully connected: _____ → 10 Output: 10 class probabilities (softmax) ``` -**14. Record your CNN results:** +**17. Record your CNN results:** | Epochs | Val accuracy (final) | Test accuracy | F1 (macro) | |--------|---------------------|--------------|------------| | | | | | -**15. Fill in the final comparison table with every classifier you built in this lab:** +**18. Fill in the final comparison table with every classifier you built in this lab:** | Classifier | Hyperparameters | Test accuracy | F1 (macro) | Notes | |-----------|----------------|--------------|------------|-------| @@ -116,14 +136,14 @@ Output: 10 class probabilities (softmax) | MLP | hidden= | | | | | CNN | | | | | -**16. Architecture comparison: the MLP and CNN both ultimately process the same 784-pixel images, but the CNN reliably outperforms the MLP. What does the CNN know about images that the MLP does not?** +**19. Architecture comparison: the MLP and CNN both ultimately process the same 784-pixel images, but the CNN reliably outperforms the MLP. What does the CNN know about images that the MLP does not?** *Your answer:* -**17. Model selection: if you needed to deploy a digit classifier on a device with very limited memory and compute (e.g., a microcontroller), which classifier would you choose, and why? (Consider model size, prediction speed, and accuracy.)** +**20. Model selection: if you needed to deploy a digit classifier on a device with very limited memory and compute (e.g., a microcontroller), which classifier would you choose, and why? (Consider model size, prediction speed, and accuracy.)** *Your answer:* -**18. Real-world applications: CNNs are used for object detection, face recognition, and medical imaging. What properties of CNNs make them well suited for these applications—and what would have to change to handle images that aren't neatly centered and cropped, the way MNIST's are?** +**21. Real-world applications: CNNs are used for object detection, face recognition, and medical imaging. What properties of CNNs make them well suited for these applications—and what would have to change to handle images that aren't neatly centered and cropped, the way MNIST's are?** *Your answer:* diff --git a/uv.lock b/uv.lock index f3dafcc..9d93c85 100644 --- a/uv.lock +++ b/uv.lock @@ -15,7 +15,9 @@ name = "classification-neural" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "joblib" }, { name = "numpy" }, + { name = "opencv-python" }, { name = "pandas" }, { name = "scikit-learn" }, { name = "torch" }, @@ -24,7 +26,9 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "joblib", specifier = ">=1.3" }, { name = "numpy", specifier = ">=1.24" }, + { name = "opencv-python", specifier = ">=4.13.0.92" }, { name = "pandas", specifier = ">=2.0" }, { name = "scikit-learn", specifier = ">=1.3" }, { name = "torch", specifier = ">=2.0" }, @@ -470,6 +474,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/64/3708a90d1ebe202ffdeb7185f878a3c84d15c2b2c31858da2ce0583e2def/nvidia_nvtx-13.0.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb7780edb6b14107373c835bf8b72e7a178bac7367e23da7acb108f973f157a6", size = 148878, upload-time = "2025-09-04T08:28:53.627Z" }, ] +[[package]] +name = "opencv-python" +version = "4.13.0.92" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/6f/5a28fef4c4a382be06afe3938c64cc168223016fa520c5abaf37e8862aa5/opencv_python-4.13.0.92-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:caf60c071ec391ba51ed00a4a920f996d0b64e3e46068aac1f646b5de0326a19", size = 46247052, upload-time = "2026-02-05T07:01:25.046Z" }, + { url = "https://files.pythonhosted.org/packages/08/ac/6c98c44c650b8114a0fb901691351cfb3956d502e8e9b5cd27f4ee7fbf2f/opencv_python-4.13.0.92-cp37-abi3-macosx_14_0_x86_64.whl", hash = "sha256:5868a8c028a0b37561579bfb8ac1875babdc69546d236249fff296a8c010ccf9", size = 32568781, upload-time = "2026-02-05T07:01:41.379Z" }, + { url = "https://files.pythonhosted.org/packages/3e/51/82fed528b45173bf629fa44effb76dff8bc9f4eeaee759038362dfa60237/opencv_python-4.13.0.92-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bc2596e68f972ca452d80f444bc404e08807d021fbba40df26b61b18e01838a", size = 47685527, upload-time = "2026-02-05T06:59:11.24Z" }, + { url = "https://files.pythonhosted.org/packages/db/07/90b34a8e2cf9c50fe8ed25cac9011cde0676b4d9d9c973751ac7616223a2/opencv_python-4.13.0.92-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:402033cddf9d294693094de5ef532339f14ce821da3ad7df7c9f6e8316da32cf", size = 70460872, upload-time = "2026-02-05T06:59:19.162Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/7a9cc719b3eaf4377b9c2e3edeb7ed3a81de41f96421510c0a169ca3cfd4/opencv_python-4.13.0.92-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:bccaabf9eb7f897ca61880ce2869dcd9b25b72129c28478e7f2a5e8dee945616", size = 46708208, upload-time = "2026-02-05T06:59:15.419Z" }, + { url = "https://files.pythonhosted.org/packages/fd/55/b3b49a1b97aabcfbbd6c7326df9cb0b6fa0c0aefa8e89d500939e04aa229/opencv_python-4.13.0.92-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:620d602b8f7d8b8dab5f4b99c6eb353e78d3fb8b0f53db1bd258bb1aa001c1d5", size = 72927042, upload-time = "2026-02-05T06:59:23.389Z" }, + { url = "https://files.pythonhosted.org/packages/fb/17/de5458312bcb07ddf434d7bfcb24bb52c59635ad58c6e7c751b48949b009/opencv_python-4.13.0.92-cp37-abi3-win32.whl", hash = "sha256:372fe164a3148ac1ca51e5f3ad0541a4a276452273f503441d718fab9c5e5f59", size = 30932638, upload-time = "2026-02-05T07:02:14.98Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a5/1be1516390333ff9be3a9cb648c9f33df79d5096e5884b5df71a588af463/opencv_python-4.13.0.92-cp37-abi3-win_amd64.whl", hash = "sha256:423d934c9fafb91aad38edf26efb46da91ffbc05f3f59c4b0c72e699720706f5", size = 40212062, upload-time = "2026-02-05T07:02:12.724Z" }, +] + [[package]] name = "pandas" version = "3.0.3"