generated from mwc/lab_compression
Initial commit
This commit is contained in:
39
text_codecs/alphanumeric.py
Normal file
39
text_codecs/alphanumeric.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import string
|
||||
import codecs
|
||||
from custom_codecs.register import register_codec
|
||||
from easybits import Bits
|
||||
|
||||
# Characters that `encode` keeps verbatim: ASCII letters and digits.
# (Whitespace is handled separately; everything else is discarded.)
allowed_characters = string.ascii_letters + string.digits
|
||||
|
||||
def encode(text):
    """A (very) lossy encoder which only saves ASCII letters, digits, and spaces.

    Every other character is discarded, and any run of whitespace
    (tabs, newlines, repeated spaces, ...) collapses into a single space.
    Returns the encoded bytes.
    """
    kept = []
    prev_was_space = False
    for ch in text:
        if ch in allowed_characters:
            kept.append(ch)
            prev_was_space = False
        elif ch in string.whitespace:
            # Emit at most one space per run of whitespace.
            if not prev_was_space:
                kept.append(' ')
                prev_was_space = True
    return Bits(''.join(kept)).bytes
|
||||
|
||||
def decode(data):
    """Decode bytes produced by `encode` back into a string.

    Each byte is interpreted as a single 8-bit ASCII character. (The
    registered codec wrapper — not this function — reports the number of
    bytes consumed, so only the text is returned here.)

    Raises:
        ValueError: if a byte cannot be interpreted as an ASCII character.
    """
    characters = []
    for i, byte in enumerate(data):
        try:
            characters.append(Bits(byte, length=8).ascii)
        except OverflowError as err:
            # The original left debugging code (`print`/`breakpoint()`) here,
            # which would hang any non-interactive caller. Fail loudly instead.
            raise ValueError(
                f"Byte {byte!r} at index {i} is not a valid ASCII character"
            ) from err
    # Join once instead of repeated `+=` (quadratic on long inputs).
    return ''.join(characters)
|
||||
|
||||
# Make this codec usable as "alphanumeric" via Python's codecs machinery.
register_codec(encode, decode, "alphanumeric")
|
||||
33
text_codecs/ascii7.py
Normal file
33
text_codecs/ascii7.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from custom_codecs.register import register_codec
|
||||
from easybits import Bits
|
||||
|
||||
def encode(text):
    """An encoder which only handles ASCII: non-ASCII characters
    are replaced with '?'. Once all the characters are ASCII, this encoder
    writes the first seven bits of each byte (the first bit of every
    ASCII character is 0, so we can easily reconstruct the full byte
    when we decode). Therefore, this encoder compresses ASCII text into
    7/8 of the usual size.
    """
    def seven_bits(character):
        # Substitute '?' for anything outside the ASCII range.
        try:
            full = Bits(character, encoding='ascii')
        except UnicodeEncodeError:
            full = Bits('?')
        # Drop the leading 0 bit; it is restored during decoding.
        return full[1:]

    packed = Bits()
    for char in text:
        packed = packed.concat(seven_bits(char))
    return packed.bytes
|
||||
|
||||
def decode(data):
    """The matching decoder. Reads seven bits at a time, putting a 0 on
    the front to create a full byte. Then converts this byte into its
    ASCII representation.
    """
    stream = Bits(bytes(data))
    leading_zero = Bits('0')
    characters = []
    for start in range(0, len(stream), 7):
        group = stream[start:start + 7]
        full_byte = leading_zero.concat(group)
        characters.append(Bits(full_byte).ascii)
    return ''.join(characters)
|
||||
|
||||
# Make this codec usable as "ascii7" via Python's codecs machinery.
register_codec(encode, decode, "ascii7")
|
||||
75
text_codecs/evaluate.py
Normal file
75
text_codecs/evaluate.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
import codecs
|
||||
import requests
|
||||
from tabulate import tabulate
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
# Directory holding the custom codec modules; appended to sys.path below
# so that `__import__(encoding)` can find them by module name.
codecs_dir = "text_codecs"
|
||||
|
||||
def evaluate_encoding(encoding, text_path):
    """Measure how well `encoding` compresses the text at `text_path`.

    If the codec is not already registered, a module named after the
    encoding is imported; importing it is expected to register the codec
    as a side effect. The compressed output is cached next to the source
    text (using the encoding name as the file suffix) and reused on
    subsequent runs.

    Returns a dict with keys 'encoding', 'original' (bytes of UTF-8
    source), 'compressed' (bytes of encoded output), and
    'compression_rate' (compressed / original).
    """
    print(f"Evaluating encoding {encoding}")
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Importing the codec module registers it via register_codec.
        __import__(encoding)
    source = Path(text_path)
    text = source.read_text()
    compressed_path = source.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)
    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    return {
        'encoding': encoding,
        'original': original_size,
        'compressed': compressed_size,
        # Guard against ZeroDivisionError on an empty input file.
        'compression_rate': (
            compressed_size / original_size if original_size else 0.0
        ),
    }
|
||||
|
||||
def inspect_encoded_text(encoding, text_path):
    """Round-trip the text at `text_path` through `encoding` and return it.

    Imports the codec module if the encoding is not already registered.
    The encoded bytes are cached alongside the source file (with the
    encoding name as suffix) and then decoded back into a string.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        __import__(encoding)
    source = Path(text_path)
    encoded_path = source.with_suffix('.' + encoding)
    if not encoded_path.exists():
        encoded_path.write_bytes(source.read_text().encode(encoding))
    return encoded_path.read_bytes().decode(encoding)
|
||||
|
||||
# --- Command-line entry point -------------------------------------------
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"

parser = ArgumentParser("A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
# Use the module constant so the default stays in sync with the download
# check below (previously the path string was duplicated inline).
parser.add_argument("--text", "-t", default=default_text,
                    help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true",
                    help="Remove all generated files")
args = parser.parse_args()
# Only remove the directory if it exists; rmtree raises otherwise.
if args.clean and text_dir.exists():
    shutil.rmtree(text_dir)
if not text_dir.exists():
    text_dir.mkdir()
if not Path(args.text).exists():
    if args.text == default_text:
        # Fetch the default corpus from Project Gutenberg on first run.
        print("Downloading 'little_women.txt' from Project Gutenberg...")
        response = requests.get(default_text_url, stream=True)
        # Fail fast rather than caching an HTTP error page as the corpus.
        response.raise_for_status()
        with open(default_text, 'wb') as f:
            f.write(response.content)
    else:
        raise ValueError(f"{args.text} does not exist")
# Custom codec modules live in codecs_dir; make them importable by name.
sys.path.append(codecs_dir)
if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))
if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))
|
||||
|
||||
17
text_codecs/register.py
Normal file
17
text_codecs/register.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import codecs
|
||||
|
||||
def register_codec(encode, decode, name):
    """Registers a codec so that it can later be used to encode
    or decode strings and bytes.
    """
    def wrapped_encode(text):
        # The codecs API requires (output, length consumed) tuples.
        return encode(text), len(text)

    def wrapped_decode(data):
        return decode(data), len(data)

    def finder(query):
        # codecs.register calls this for every unknown encoding lookup;
        # answer only for our own name and let other queries fall through.
        if query == name:
            return codecs.CodecInfo(wrapped_encode, wrapped_decode, name=name)
        return None

    codecs.register(finder)
|
||||
Reference in New Issue
Block a user