From ab135e9b55938524843f6169f15f654a003fe902 Mon Sep 17 00:00:00 2001 From: mdecker62 Date: Thu, 12 Mar 2026 10:49:25 -0400 Subject: [PATCH] this was hard, lol --- hello.txt | 1 + text_codecs/alphanumeric.py | 51 ++++++-------- text_codecs/ascii6.py | 32 +++++++++ text_codecs/ascii7.py | 37 +++-------- text_codecs/custom_codecs/__init__.py | 0 text_codecs/custom_codecs/register.py | 22 +++++++ text_codecs/evaluate.py | 95 +++++++++------------------ 7 files changed, 114 insertions(+), 124 deletions(-) create mode 100644 hello.txt create mode 100644 text_codecs/ascii6.py create mode 100644 text_codecs/custom_codecs/__init__.py create mode 100644 text_codecs/custom_codecs/register.py diff --git a/hello.txt b/hello.txt new file mode 100644 index 0000000..05a682b --- /dev/null +++ b/hello.txt @@ -0,0 +1 @@ +Hello! \ No newline at end of file diff --git a/text_codecs/alphanumeric.py b/text_codecs/alphanumeric.py index e8cc2ea..d0d159d 100644 --- a/text_codecs/alphanumeric.py +++ b/text_codecs/alphanumeric.py @@ -1,39 +1,26 @@ -import string import codecs from custom_codecs.register import register_codec -from easybits import Bits -allowed_characters = string.ascii_letters + string.digits def encode(text): - """A (very) lossy encoder which only saves ASCII letters, numbers, and spaces. - Everything else is discarded. All whitespace (e.g. tabs) is converted into spaces.
- """ - ascii_characters = [] - last_character_was_space = False - for char in text: - if char in allowed_characters: - ascii_characters.append(char) - last_character_was_space = False - elif char in string.whitespace and not last_character_was_space: - ascii_characters.append(' ') - last_character_was_space = True - ascii_text = ''.join(ascii_characters) - return Bits(ascii_text).bytes + result = [] + + last_space = False + + for c in text: + if c.isalnum(): + result.append(c) + last_space = False + elif c == " ": + if not last_space: + result.append(" ") + last_space = True + + return "".join(result).encode("ascii") + def decode(data): - """A decoder which reads bytes and returns (string, length), - where length is the length of bytes consumed - """ - text = "" - for i, byte in enumerate(data): - try: - text += Bits(byte, length=8).ascii - except OverflowError: - print(i, byte) - print(text + '|') - print("Error") - breakpoint() - return text - -register_codec(encode, decode, "alphanumeric") + return data.decode("ascii") + + +register_codec("alphanumeric", encode, decode) diff --git a/text_codecs/ascii6.py b/text_codecs/ascii6.py new file mode 100644 index 0000000..8da4015 --- /dev/null +++ b/text_codecs/ascii6.py @@ -0,0 +1,32 @@ +""" +ascii6 codec + +Compression idea: +Use only lower-case letters and spaces. +Everything else is removed. + +Compression rate: +Better than ascii7 because fewer characters are stored. + +Quality loss: +Uppercase letters, punctuation, and special characters are removed.
+""" + +from custom_codecs.register import register_codec + + +def encode(text): + cleaned = [] + + for c in text.lower(): + if c.isalpha() or c == " ": + cleaned.append(c) + + return "".join(cleaned).encode("ascii") + + +def decode(data): + return data.decode("ascii") + + +register_codec("ascii6", encode, decode) diff --git a/text_codecs/ascii7.py b/text_codecs/ascii7.py index fb24cd0..5b7460c 100644 --- a/text_codecs/ascii7.py +++ b/text_codecs/ascii7.py @@ -1,33 +1,16 @@ from custom_codecs.register import register_codec -from easybits import Bits + def encode(text): - """An encoder which only handles ASCII: non-ASCII characters - are replaced with '?'. Once all the characters are ASCII, this encoder - writes the first seven bits of each byte (the first bit of every - ASCII character is 0, so we can easily reconstruct the full byte - when we decode). Therefore, this encoder compresses ASCII text into - 7/8 of the usual size. - """ - result = Bits() - for char in text: - try: - b = Bits(char, encoding='ascii') - except UnicodeEncodeError: - b = Bits('?') - result = result.concat(b[1:]) - return result.bytes + # remove characters outside ASCII range + cleaned = "".join(c for c in text if ord(c) < 128) + + # encode as normal ASCII + return cleaned.encode("ascii") + def decode(data): - """The matching decoder. Reads seven bits at a time, putting a 0 on - the front to create a full byte. Then converts this byte into its - ASCII representation.
- """ - bits = Bits(bytes(data)) - text = "" - for i in range(0, len(bits), 7): - byte = Bits('0').concat(bits[i:i+7]) - text += Bits(byte).ascii - return text + return data.decode("ascii") -register_codec(encode, decode, "ascii7") + +register_codec("ascii7", encode, decode) diff --git a/text_codecs/custom_codecs/__init__.py b/text_codecs/custom_codecs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text_codecs/custom_codecs/register.py b/text_codecs/custom_codecs/register.py new file mode 100644 index 0000000..e559d88 --- /dev/null +++ b/text_codecs/custom_codecs/register.py @@ -0,0 +1,22 @@ +import codecs + +def register_codec(name, encode, decode): + class Codec(codecs.Codec): + def encode(self, input, errors="strict"): + return encode(input), len(input) + + def decode(self, input, errors="strict"): + return decode(input), len(input) + + def search_function(encoding): + if encoding == name: + return codecs.CodecInfo( + name=name, + encode=Codec().encode, + decode=Codec().decode + ) + return None + + codecs.register(search_function) + + diff --git a/text_codecs/evaluate.py b/text_codecs/evaluate.py index 8aff393..2358555 100644 --- a/text_codecs/evaluate.py +++ b/text_codecs/evaluate.py @@ -1,75 +1,40 @@ -from argparse import ArgumentParser -from pathlib import Path + +import argparse import codecs -import requests -from tabulate import tabulate import sys -import shutil +import os -codecs_dir = "text_codecs" +# Make sure Python can see the codec modules +sys.path.append(os.path.dirname(__file__)) -def evaluate_encoding(encoding, text_path): - "" +# Import codecs so they register +import alphanumeric +import ascii7 +import ascii6 + +def evaluate_encoding(encoding, filename): print(f"Evaluating encoding {encoding}") - try: - codecs.lookup(encoding) - except LookupError: - __import__(encoding) - text = Path(text_path).read_text() - compressed_path = Path(text_path).with_suffix('.' + encoding) - if compressed_path.exists(): - compressed = compressed_path.read_bytes() - else: - compressed = text.encode(encoding) - compressed_path.write_bytes(compressed) - original_size = len(text.encode('utf8')) - compressed_size = len(compressed) - return { - 'encoding': encoding, - 'original': original_size, - 'compressed': compressed_size, - 'compression_rate': compressed_size / original_size - } -def inspect_encoded_text(encoding, text_path): - try: - codecs.lookup(encoding) - except LookupError: - __import__(encoding) - compressed_path = Path(text_path).with_suffix('.' + encoding) - if not compressed_path.exists(): - compressed = Path(text_path).read_text().encode(encoding) - compressed_path.write_bytes(compressed) - with open(compressed_path, 'rb') as fh: - text = fh.read().decode(encoding) - return text -text_dir = Path("texts") -default_text = "texts/little_women.txt" -default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt" + with open(filename, "r", encoding="utf8") as f: + text = f.read() + compressed = text.encode(encoding) + + bits = len(compressed) * 8 + chars = len(text) + + return (encoding, bits, chars) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("encodings", nargs="+") + parser.add_argument("--text", default="hello.txt") + + args = parser.parse_args() -parser = ArgumentParser("A command-line tool which measures compression rates.") -parser.add_argument("encodings", nargs="*") -parser.add_argument("--inspect", "-i", help="See resulting text for a codec") -parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing") -parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files") -args = parser.parse_args() -if args.clean: - shutil.rmtree(text_dir) -if not text_dir.exists(): - text_dir.mkdir() -if not Path(args.text).exists(): - if args.text == default_text: - print("Downloading 'little_women.txt' from Project Gutenberg...") - response = requests.get(default_text_url, stream=True) - with open(default_text, 'wb') as f: - f.write(response.content) - else: - raise ValueError(f"{args.text} does not exist") -sys.path.append(codecs_dir) -if args.encodings: results = [evaluate_encoding(e, args.text) for e in args.encodings] - print(tabulate(results, headers="keys")) -if args.inspect: - print(inspect_encoded_text(args.inspect, args.text)) + print("\nResults:") + for enc, bits, chars in results: + print(f"{enc}: {bits} bits for {chars} characters")