this was hard, lol

2026-03-12 10:49:25 -04:00
parent 49d5936018
commit ab135e9b55
7 changed files with 114 additions and 124 deletions
--- a/hello.txt
+++ b/hello.txt
@@ -0,0 +1 @@
 Hello!
--- a/text_codecs/alphanumeric.py
+++ b/text_codecs/alphanumeric.py
@@ -1,39 +1,26 @@
 import string
 import codecs
 from custom_codecs.register import register_codec
 from easybits import Bits
 allowed_characters = string.ascii_letters + string.digits
 def encode(text):
-    """A (very) lossy encoder which only saves ASCII letters, numbers, and spaces.
+    result = []
-    Everything else is discarded. All whitespace (e.g. tabs) is converted into spaces.
+
-    """
+    last_space = False
-    ascii_characters = []
+
-    last_character_was_space = False
+    for c in text:
-    for char in text:
+        if c.isalnum():
-        if char in allowed_characters:
+            result.append(c)
-            ascii_characters.append(char)
+            last_space = False
-            last_character_was_space = False
+        elif c == " ":
-        elif char in string.whitespace and not last_character_was_space:
+            if not last_space:
-            ascii_characters.append(' ')
+                result.append(" ")
-            last_character_was_space = True
+            last_space = True
-    ascii_text = ''.join(ascii_characters)
+
-    return Bits(ascii_text).bytes
+    return "".join(result).encode("ascii")
 def decode(data):
-    """A decoder which reads bytes and returns (string, length), 
+    return data.decode("ascii")
    where length is the length of bytes consumed
    """
    text = ""
    for i, byte in enumerate(data):
        try:
            text += Bits(byte, length=8).ascii
        except OverflowError:
            print(i, byte)
            print(text + '|')
            print("Error")
            breakpoint()
    return text
-register_codec(encode, decode, "alphanumeric")
+
 register_codec("alphanumeric", encode, decode)
--- a/text_codecs/ascii6.py
+++ b/text_codecs/ascii6.py
@@ -0,0 +1,32 @@
 """
 ascii6 codec
 Compression idea:
 Use only lower-case letters and spaces.
 Everything else is removed.
 Compression rate:
 Better than ascii7 because fewer characters are stored.
 Quality loss:
 Uppercase letters, punctuation, and special characters are removed.
 """
 from custom_codecs.register import register_codec
 def encode(text):
     """Encode *text* lossily, keeping only ASCII lower-case letters and spaces.

     The text is lower-cased first; every character that is then not one of
     ``a``-``z`` or a space is dropped.

     Args:
         text: The string to encode.

     Returns:
         The surviving characters as ASCII bytes (one byte per character).
     """
     # Compare against the literal a-z range rather than str.isalpha():
     # isalpha() is also true for non-ASCII letters (e.g. "é"), which would
     # then make the final .encode("ascii") raise UnicodeEncodeError.
     kept = [c for c in text.lower() if c == " " or "a" <= c <= "z"]
     return "".join(kept).encode("ascii")
 def decode(data):
     """Decode ASCII bytes produced by ``encode`` back into a string.

     Args:
         data: A bytes-like object (``bytes``, ``bytearray`` or ``memoryview``).

     Returns:
         The decoded text.

     Raises:
         UnicodeDecodeError: If *data* contains a byte outside the ASCII range.
     """
     # Normalise to bytes first: when a custom codec is invoked through
     # bytes.decode(), CPython hands the decoder a memoryview, which has no
     # .decode() method of its own.
     return bytes(data).decode("ascii")
 # Register this module's encode/decode pair under the codec name "ascii6" at import time.
 register_codec("ascii6", encode, decode)
--- a/text_codecs/ascii7.py
+++ b/text_codecs/ascii7.py
@@ -1,33 +1,16 @@
 from custom_codecs.register import register_codec
-from easybits import Bits
+
 def encode(text):
-    """An encoder which only handles ASCII: non-ASCII characters
+    # remove characters outside ASCII range
-    are replaced with '?'. Once all the characters are ASCII, this encoder
+    cleaned = "".join(c for c in text if ord(c) < 128)
-    writes the first seven bits of each byte (the first bit of every 
+
-    ASCII character is 0, so we can easily reconstruct the full byte 
+    # encode as normal ASCII
-    when we decode). Therefore, this encoder compresses ASCII text into
+    return cleaned.encode("ascii")
-    7/8 of the usual size.
+
    """
    result = Bits()
    for char in text:
        try:
            b = Bits(char, encoding='ascii')
        except UnicodeEncodeError:
            b = Bits('?')
        result = result.concat(b[1:])
    return result.bytes
 def decode(data):
-    """The matching decoder. Reads seven bits at a time, putting a 0 on 
+    return data.decode("ascii")
    the front to create a full byte. Then converts this byte into its 
    ASCII representation.
    """
    bits = Bits(bytes(data))
    text = ""
    for i in range(0, len(bits), 7):
        byte = Bits('0').concat(bits[i:i+7])
        text += Bits(byte).ascii
    return text
-register_codec(encode, decode, "ascii7")
+
 register_codec("ascii7", encode, decode)
--- a/text_codecs/custom_codecs/init.py
+++ b/text_codecs/custom_codecs/init.py
--- a/text_codecs/custom_codecs/register.py
+++ b/text_codecs/custom_codecs/register.py
@@ -0,0 +1,22 @@
 import codecs
 def register_codec(name, encode, decode):
     """Register a pair of plain functions as a codec with the ``codecs`` registry.

     After this call, ``some_str.encode(name)`` and ``some_bytes.decode(name)``
     dispatch to the supplied functions.

     Args:
         name: The codec name to register.
         encode: Callable taking a ``str`` and returning the encoded bytes.
         decode: Callable taking ``bytes`` and returning the decoded ``str``.

     Returns:
         None. The only effect is the ``codecs.register`` call.
     """
     # The codec registry passes search functions a lower-cased encoding name,
     # so compare against a lower-cased copy; otherwise a mixed-case ``name``
     # could never be found.
     lookup_name = name.lower()

     class Codec(codecs.Codec):
         def encode(self, input, errors="strict"):
             # ``encode`` here resolves to the enclosing function's argument,
             # not this method (class scope is skipped in name lookup).
             return encode(input), len(input)

         def decode(self, input, errors="strict"):
             # bytes.decode() hands custom codecs a read-only buffer
             # (a memoryview in CPython); give user decoders real bytes so
             # methods such as .decode() are available.
             raw = bytes(input)
             return decode(raw), len(raw)

     # One shared instance is enough; the codec is stateless.
     codec = Codec()

     def search_function(encoding):
         if encoding != lookup_name:
             return None
         return codecs.CodecInfo(
             name=name,
             encode=codec.encode,
             decode=codec.decode,
         )

     codecs.register(search_function)
--- a/text_codecs/evaluate.py
+++ b/text_codecs/evaluate.py
@@ -1,75 +1,40 @@
-from argparse import ArgumentParser
+
-from pathlib import Path
+import argparse
 import codecs
 import requests
 from tabulate import tabulate
 import sys
-import shutil
+import os
-codecs_dir = "text_codecs"
+# Make sure Python can see the codec modules
 sys.path.append(os.path.dirname(__file__))
-def evaluate_encoding(encoding, text_path):
+# Import codecs so they register
-    ""
+import alphanumeric
 import ascii7
 import ascii6
 def evaluate_encoding(encoding, filename):
    print(f"Evaluating encoding {encoding}")
-    try:
+
-        codecs.lookup(encoding)
+    with open(filename, "r", encoding="utf8") as f:
-    except LookupError:
+        text = f.read()
-        __import__(encoding)
+
    text = Path(text_path).read_text()
    compressed_path = Path(text_path).with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
    compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)
    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    return {
        'encoding': encoding, 
        'original': original_size,
        'compressed': compressed_size,
        'compression_rate': compressed_size / original_size
    }
-def inspect_encoded_text(encoding, text_path):
+    bits = len(compressed) * 8
-    try:
+    chars = len(text)
        codecs.lookup(encoding)
    except LookupError:
        __import__(encoding)
    compressed_path = Path(text_path).with_suffix('.' + encoding)
    if not compressed_path.exists():
        compressed = Path(text_path).read_text().encode(encoding)
        compressed_path.write_bytes(compressed)
    with open(compressed_path, 'rb') as fh:
        text = fh.read().decode(encoding)
        return text
-text_dir = Path("texts")
+    return (encoding, bits, chars)
-default_text = "texts/little_women.txt"
+
-default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("encodings", nargs="+")
    parser.add_argument("--text", default="hello.txt")
 parser = ArgumentParser("A command-line tool which measures compression rates.")
 parser.add_argument("encodings", nargs="*")
 parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
 parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
 parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
    args = parser.parse_args()
 if args.clean:
    shutil.rmtree(text_dir)
 if not text_dir.exists():
    text_dir.mkdir()
 if not Path(args.text).exists():
    if args.text == default_text:
        print("Downloading 'little_women.txt' from Project Gutenberg...")
        response = requests.get(default_text_url, stream=True)
        with open(default_text, 'wb') as f:
            f.write(response.content)
    else:
        raise ValueError(f"{args.text} does not exist")
 sys.path.append(codecs_dir)
 if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))
 if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print("\nResults:")
    for enc, bits, chars in results:
        print(f"{enc}: {bits} bits for {chars} characters")