generated from mwc/lab_compression
this was hard, lol
This commit is contained in:
@@ -1,39 +1,26 @@
|
||||
import string
|
||||
import codecs
|
||||
from custom_codecs.register import register_codec
|
||||
from easybits import Bits
|
||||
|
||||
# Characters the lossy codec keeps verbatim.
allowed_characters = string.ascii_letters + string.digits


def encode(text):
    """A (very) lossy encoder which only saves ASCII letters, numbers, and spaces.

    Everything else is discarded. All whitespace (e.g. tabs, newlines) is
    converted into spaces, and runs of consecutive whitespace collapse to a
    single space.

    Returns the surviving text encoded as ASCII bytes.
    """
    ascii_characters = []
    last_character_was_space = False
    for char in text:
        if char in allowed_characters:
            ascii_characters.append(char)
            last_character_was_space = False
        elif char in string.whitespace and not last_character_was_space:
            # Collapse any whitespace run into one plain space.
            ascii_characters.append(' ')
            last_character_was_space = True
    # Only ASCII letters/digits/spaces survive, so this encode cannot raise.
    return ''.join(ascii_characters).encode('ascii')
|
||||
|
||||
|
||||
def decode(data):
    """Decode bytes produced by ``encode`` back into a string.

    The payload is plain ASCII bytes (``encode`` guarantees this), so
    decoding is a straight ASCII decode.
    """
    return data.decode("ascii")
|
||||
|
||||
# register_codec's signature (custom_codecs/register.py) is
# (name, encode, decode) — register exactly once, in that order.
register_codec("alphanumeric", encode, decode)
|
||||
|
||||
32
text_codecs/ascii6.py
Normal file
32
text_codecs/ascii6.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""
|
||||
ascii6 codec
|
||||
|
||||
Compression idea:
|
||||
Use only lower-case letters and spaces.
|
||||
Everything else is removed.
|
||||
|
||||
Compression rate:
|
||||
Better than ascii7 because fewer characters are stored.
|
||||
|
||||
Quality loss:
|
||||
Uppercase letters, punctuation, and special characters are removed.
|
||||
"""
|
||||
|
||||
from custom_codecs.register import register_codec
|
||||
|
||||
|
||||
def encode(text):
    """Encode ``text`` keeping only lower-case ASCII letters and spaces.

    Input is case-folded first, so upper-case letters survive as lower
    case. Everything outside a-z and the space character is dropped —
    including non-ASCII letters, which ``str.isalpha()`` would accept but
    ``.encode("ascii")`` cannot represent.

    Returns the surviving text as ASCII bytes.
    """
    cleaned = []
    for c in text.lower():
        # Explicit ASCII range check: c.isalpha() is True for letters like
        # 'é', which would make the final ASCII encode raise.
        if 'a' <= c <= 'z' or c == ' ':
            cleaned.append(c)
    return ''.join(cleaned).encode('ascii')
|
||||
|
||||
|
||||
def decode(data):
    """Inverse of ``encode``: the stored payload is plain ASCII bytes."""
    return bytes(data).decode("ascii")
|
||||
|
||||
|
||||
# Make the codec usable via text.encode("ascii6") / data.decode("ascii6").
register_codec("ascii6", encode, decode)
|
||||
@@ -1,33 +1,16 @@
|
||||
from custom_codecs.register import register_codec
|
||||
from easybits import Bits
|
||||
|
||||
|
||||
def encode(text):
    """Encode ``text`` as ASCII bytes, silently dropping non-ASCII characters.

    Characters with code points >= 128 are removed before encoding, so the
    final ``.encode("ascii")`` can never raise UnicodeEncodeError.
    """
    # Remove characters outside the ASCII range.
    cleaned = "".join(c for c in text if ord(c) < 128)
    # Encode as plain ASCII (one byte per character).
    return cleaned.encode("ascii")
|
||||
|
||||
|
||||
def decode(data):
    """Decode bytes produced by ``encode`` back into a string.

    ``encode`` only ever emits ASCII bytes, so a plain ASCII decode
    recovers the stored text.
    """
    return data.decode("ascii")
|
||||
|
||||
# register_codec's signature (custom_codecs/register.py) is
# (name, encode, decode) — register exactly once, in that order.
register_codec("ascii7", encode, decode)
|
||||
|
||||
0
text_codecs/custom_codecs/__init__.py
Normal file
0
text_codecs/custom_codecs/__init__.py
Normal file
22
text_codecs/custom_codecs/register.py
Normal file
22
text_codecs/custom_codecs/register.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import codecs
|
||||
|
||||
def register_codec(name, encode, decode):
|
||||
class Codec(codecs.Codec):
|
||||
def encode(self, input, errors="strict"):
|
||||
return encode(input), len(input)
|
||||
|
||||
def decode(self, input, errors="strict"):
|
||||
return decode(input), len(input)
|
||||
|
||||
def search_function(encoding):
|
||||
if encoding == name:
|
||||
return codecs.CodecInfo(
|
||||
name=name,
|
||||
encode=Codec().encode,
|
||||
decode=Codec().decode
|
||||
)
|
||||
return None
|
||||
|
||||
codecs.register(search_function)
|
||||
|
||||
|
||||
@@ -1,75 +1,40 @@
|
||||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import requests
|
||||
from tabulate import tabulate
|
||||
import sys
|
||||
import shutil
|
||||
import os
|
||||
|
||||
codecs_dir = "text_codecs"
|
||||
# Make sure Python can see the codec modules
|
||||
sys.path.append(os.path.dirname(__file__))
|
||||
|
||||
def evaluate_encoding(encoding, text_path):
|
||||
""
|
||||
# Import codecs so they register
|
||||
import alphanumeric
|
||||
import ascii7
|
||||
import ascii6
|
||||
|
||||
def evaluate_encoding(encoding, filename):
    """Compress ``filename`` with ``encoding`` and measure the result.

    Returns an ``(encoding, bits, chars)`` tuple where ``bits`` is the
    size of the compressed output in bits and ``chars`` is the number of
    characters in the original text.
    """
    print(f"Evaluating encoding {encoding}")
    with open(filename, "r", encoding="utf8") as f:
        text = f.read()
    compressed = text.encode(encoding)
    bits = len(compressed) * 8
    chars = len(text)
    return (encoding, bits, chars)


def inspect_encoded_text(encoding, text_path):
    """Round-trip ``text_path`` through ``encoding`` and return the decoded text.

    The compressed bytes are cached next to the source text with the
    encoding name as the file suffix.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Importing the codec module registers it as a side effect.
        __import__(encoding)
    compressed_path = Path(text_path).with_suffix('.' + encoding)
    if not compressed_path.exists():
        compressed = Path(text_path).read_text().encode(encoding)
        compressed_path.write_bytes(compressed)
    with open(compressed_path, 'rb') as fh:
        return fh.read().decode(encoding)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("encodings", nargs="+")
|
||||
parser.add_argument("--text", default="hello.txt")
|
||||
|
||||
parser = ArgumentParser("A command-line tool which measures compression rates.")
|
||||
parser.add_argument("encodings", nargs="*")
|
||||
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
|
||||
parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
|
||||
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
|
||||
args = parser.parse_args()
|
||||
if args.clean:
|
||||
shutil.rmtree(text_dir)
|
||||
if not text_dir.exists():
|
||||
text_dir.mkdir()
|
||||
if not Path(args.text).exists():
|
||||
if args.text == default_text:
|
||||
print("Downloading 'little_women.txt' from Project Gutenberg...")
|
||||
response = requests.get(default_text_url, stream=True)
|
||||
with open(default_text, 'wb') as f:
|
||||
f.write(response.content)
|
||||
else:
|
||||
raise ValueError(f"{args.text} does not exist")
|
||||
sys.path.append(codecs_dir)
|
||||
if args.encodings:
|
||||
results = [evaluate_encoding(e, args.text) for e in args.encodings]
|
||||
print(tabulate(results, headers="keys"))
|
||||
if args.inspect:
|
||||
print(inspect_encoded_text(args.inspect, args.text))
|
||||
|
||||
results = [evaluate_encoding(e, args.text) for e in args.encodings]
|
||||
|
||||
print("\nResults:")
|
||||
for enc, bits, chars in results:
|
||||
print(f"{enc}: {bits} bits for {chars} characters")
|
||||
|
||||
Reference in New Issue
Block a user