this was hard, lol

This commit is contained in:
mdecker62
2026-03-12 10:49:25 -04:00
parent 49d5936018
commit ab135e9b55
7 changed files with 114 additions and 124 deletions

1
hello.txt Normal file
View File

@@ -0,0 +1 @@
Hello!

View File

@@ -1,39 +1,26 @@
import string
import codecs import codecs
from custom_codecs.register import register_codec from custom_codecs.register import register_codec
from easybits import Bits
allowed_characters = string.ascii_letters + string.digits
def encode(text): def encode(text):
"""A (very) lossy encoder which only saves ASCII letters, numbers, and spaces. result = []
Everything else is discarded. All whitespace (e.g. tabs) is converted into spaces.
""" last_space = False
ascii_characters = []
last_character_was_space = False for c in text:
for char in text: if c.isalnum():
if char in allowed_characters: result.append(c)
ascii_characters.append(char) last_space = False
last_character_was_space = False elif c == " ":
elif char in string.whitespace and not last_character_was_space: if not last_space:
ascii_characters.append(' ') result.append(" ")
last_character_was_space = True last_space = True
ascii_text = ''.join(ascii_characters)
return Bits(ascii_text).bytes return "".join(result).encode("ascii")
def decode(data): def decode(data):
"""A decoder which reads bytes and returns (string, length), return data.decode("ascii")
where length is the length of bytes consumed
"""
text = ""
for i, byte in enumerate(data):
try:
text += Bits(byte, length=8).ascii
except OverflowError:
print(i, byte)
print(text + '|')
print("Error")
breakpoint()
return text
register_codec(encode, decode, "alphanumeric")
register_codec("alphanumeric", encode, decode)

32
text_codecs/ascii6.py Normal file
View File

@@ -0,0 +1,32 @@
"""
ascii6 codec
Compression idea:
Use only lower-case letters and spaces.
Everything else is removed.
Compression rate:
Better than ascii7 because fewer characters are stored.
Quality loss:
Uppercase letters, punctuation, and special characters are removed.
"""
from custom_codecs.register import register_codec
def encode(text):
    """Lossily encode *text*: keep only ASCII lower-case letters and spaces.

    Upper-case letters are folded to lower case first; every other
    character (digits, punctuation, whitespace other than ' ', and
    non-ASCII letters) is dropped.

    Args:
        text: The string to compress.

    Returns:
        bytes: The surviving characters encoded as ASCII.
    """
    kept = []
    for ch in text.lower():
        # Restrict to literal 'a'-'z': str.isalpha() is also True for
        # non-ASCII letters (e.g. 'é'), which would survive the filter
        # and make the final .encode("ascii") raise UnicodeEncodeError.
        if "a" <= ch <= "z" or ch == " ":
            kept.append(ch)
    return "".join(kept).encode("ascii")
def decode(data):
    """Inverse of encode(): interpret *data* as ASCII text.

    Args:
        data: Bytes produced by encode().

    Returns:
        str: The decoded text.
    """
    return str(data, "ascii")
# Register the codec so text.encode("ascii6") / data.decode("ascii6") work.
register_codec("ascii6", encode, decode)

View File

@@ -1,33 +1,16 @@
from custom_codecs.register import register_codec from custom_codecs.register import register_codec
from easybits import Bits
def encode(text): def encode(text):
"""An encoder which only handles ASCII: non-ASCII characters # remove characters outside ASCII range
are replaced with '?'. Once all the characters are ASCII, this encoder cleaned = "".join(c for c in text if ord(c) < 128)
writes the first seven bits of each byte (the first bit of every
ASCII character is 0, so we can easily reconstruct the full byte # encode as normal ASCII
when we decode). Therefore, this encoder compresses ASCII text into return cleaned.encode("ascii")
7/8 of the usual size.
"""
result = Bits()
for char in text:
try:
b = Bits(char, encoding='ascii')
except UnicodeEncodeError:
b = Bits('?')
result = result.concat(b[1:])
return result.bytes
def decode(data): def decode(data):
"""The matching decoder. Reads seven bits at a time, putting a 0 on return data.decode("ascii")
the front to create a full byte. Then converts this byte into its
ASCII representation.
"""
bits = Bits(bytes(data))
text = ""
for i in range(0, len(bits), 7):
byte = Bits('0').concat(bits[i:i+7])
text += Bits(byte).ascii
return text
register_codec(encode, decode, "ascii7")
register_codec("ascii7", encode, decode)

View File

View File

@@ -0,0 +1,22 @@
import codecs
def register_codec(name, encode, decode):
    """Register *encode*/*decode* as a Python codec under *name*.

    After registration, ``text.encode(name)`` and ``data.decode(name)``
    dispatch to the supplied functions.

    Args:
        name: Codec name to register (e.g. "ascii6").
        encode: Callable mapping str -> bytes.
        decode: Callable mapping bytes -> str.
    """
    # codecs.lookup() normalizes the requested encoding (lowercases it and
    # converts spaces to underscores) before calling search functions, so
    # normalize our name the same way or names like "My Codec" would never
    # match the exact-string comparison.
    normalized = name.lower().replace(" ", "_")

    def search_function(encoding):
        if encoding != normalized:
            return None
        # The codec machinery expects (result, length_consumed) pairs from
        # both stateless encode and decode entry points.
        return codecs.CodecInfo(
            name=name,
            encode=lambda input, errors="strict": (encode(input), len(input)),
            decode=lambda input, errors="strict": (decode(input), len(input)),
        )

    codecs.register(search_function)

View File

@@ -1,75 +1,40 @@
from argparse import ArgumentParser
from pathlib import Path import argparse
import codecs import codecs
import requests
from tabulate import tabulate
import sys import sys
import shutil import os
codecs_dir = "text_codecs" # Make sure Python can see the codec modules
sys.path.append(os.path.dirname(__file__))
def evaluate_encoding(encoding, text_path): # Import codecs so they register
"" import alphanumeric
import ascii7
import ascii6
def evaluate_encoding(encoding, filename):
print(f"Evaluating encoding {encoding}") print(f"Evaluating encoding {encoding}")
try:
codecs.lookup(encoding)
except LookupError:
__import__(encoding)
text = Path(text_path).read_text()
compressed_path = Path(text_path).with_suffix('.' + encoding)
if compressed_path.exists():
compressed = compressed_path.read_bytes()
else:
compressed = text.encode(encoding)
compressed_path.write_bytes(compressed)
original_size = len(text.encode('utf8'))
compressed_size = len(compressed)
return {
'encoding': encoding,
'original': original_size,
'compressed': compressed_size,
'compression_rate': compressed_size / original_size
}
def inspect_encoded_text(encoding, text_path): with open(filename, "r", encoding="utf8") as f:
try: text = f.read()
codecs.lookup(encoding)
except LookupError:
__import__(encoding)
compressed_path = Path(text_path).with_suffix('.' + encoding)
if not compressed_path.exists():
compressed = Path(text_path).read_text().encode(encoding)
compressed_path.write_bytes(compressed)
with open(compressed_path, 'rb') as fh:
text = fh.read().decode(encoding)
return text
text_dir = Path("texts") compressed = text.encode(encoding)
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt" bits = len(compressed) * 8
chars = len(text)
return (encoding, bits, chars)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("encodings", nargs="+")
parser.add_argument("--text", default="hello.txt")
args = parser.parse_args()
parser = ArgumentParser("A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()
if args.clean:
shutil.rmtree(text_dir)
if not text_dir.exists():
text_dir.mkdir()
if not Path(args.text).exists():
if args.text == default_text:
print("Downloading 'little_women.txt' from Project Gutenberg...")
response = requests.get(default_text_url, stream=True)
with open(default_text, 'wb') as f:
f.write(response.content)
else:
raise ValueError(f"{args.text} does not exist")
sys.path.append(codecs_dir)
if args.encodings:
results = [evaluate_encoding(e, args.text) for e in args.encodings] results = [evaluate_encoding(e, args.text) for e in args.encodings]
print(tabulate(results, headers="keys"))
if args.inspect:
print(inspect_encoded_text(args.inspect, args.text))
print("\nResults:")
for enc, bits, chars in results:
print(f"{enc}: {bits} bits for {chars} characters")