Rewrite the custom codecs without the easybits dependency, add an ascii6 codec, and simplify the compression benchmark script

This commit is contained in:
mdecker62
2026-03-12 10:49:25 -04:00
parent 49d5936018
commit ab135e9b55
7 changed files with 114 additions and 124 deletions

1
hello.txt Normal file
View File

@@ -0,0 +1 @@
Hello!

View File

@@ -1,39 +1,26 @@
import string
import codecs
from custom_codecs.register import register_codec
from easybits import Bits
allowed_characters = string.ascii_letters + string.digits
def encode(text):
"""A (very) lossy encoder which only saves ASCII letters, numbers, and spaces.
Everything else is discarded. All whitespace (e.g. tabs) is converted into spaces.
"""
ascii_characters = []
last_character_was_space = False
for char in text:
if char in allowed_characters:
ascii_characters.append(char)
last_character_was_space = False
elif char in string.whitespace and not last_character_was_space:
ascii_characters.append(' ')
last_character_was_space = True
ascii_text = ''.join(ascii_characters)
return Bits(ascii_text).bytes
result = []
last_space = False
for c in text:
if c.isalnum():
result.append(c)
last_space = False
elif c == " ":
if not last_space:
result.append(" ")
last_space = True
return "".join(result).encode("ascii")
def decode(data):
"""A decoder which reads bytes and returns (string, length),
where length is the length of bytes consumed
"""
text = ""
for i, byte in enumerate(data):
try:
text += Bits(byte, length=8).ascii
except OverflowError:
print(i, byte)
print(text + '|')
print("Error")
breakpoint()
return text
return data.decode("ascii")
register_codec(encode, decode, "alphanumeric")
register_codec("alphanumeric", encode, decode)

32
text_codecs/ascii6.py Normal file
View File

@@ -0,0 +1,32 @@
"""
ascii6 codec
Compression idea:
Use only lower-case letters and spaces.
Everything else is removed.
Compression rate:
Better than ascii7 because fewer characters are stored.
Quality loss:
Uppercase letters are folded to lowercase; digits, punctuation, and special characters are removed.
"""
from custom_codecs.register import register_codec
def encode(text):
    """Encode *text* as ascii6: lowercase letters and spaces only.

    Uppercase letters are folded to lowercase; everything that is not an
    ASCII letter or a space (digits, punctuation, accented characters) is
    dropped, so the result always encodes cleanly as ASCII bytes.
    """
    # c.isascii() guards against non-ASCII alphabetic characters (e.g. 'e'
    # with an accent): they pass isalpha() but would make .encode("ascii")
    # raise UnicodeEncodeError.
    kept = [c for c in text.lower() if (c.isalpha() and c.isascii()) or c == " "]
    return "".join(kept).encode("ascii")
def decode(data):
    """Decode ascii6 bytes back into a plain ASCII string."""
    return str(data, "ascii")
register_codec("ascii6", encode, decode)

View File

@@ -1,33 +1,16 @@
from custom_codecs.register import register_codec
from easybits import Bits
def encode(text):
"""An encoder which only handles ASCII: non-ASCII characters
are replaced with '?'. Once all the characters are ASCII, this encoder
writes the first seven bits of each byte (the first bit of every
ASCII character is 0, so we can easily reconstruct the full byte
when we decode). Therefore, this encoder compresses ASCII text into
7/8 of the usual size.
"""
result = Bits()
for char in text:
try:
b = Bits(char, encoding='ascii')
except UnicodeEncodeError:
b = Bits('?')
result = result.concat(b[1:])
return result.bytes
# remove characters outside ASCII range
cleaned = "".join(c for c in text if ord(c) < 128)
# encode as normal ASCII
return cleaned.encode("ascii")
def decode(data):
"""The matching decoder. Reads seven bits at a time, putting a 0 on
the front to create a full byte. Then converts this byte into its
ASCII representation.
"""
bits = Bits(bytes(data))
text = ""
for i in range(0, len(bits), 7):
byte = Bits('0').concat(bits[i:i+7])
text += Bits(byte).ascii
return text
return data.decode("ascii")
register_codec(encode, decode, "ascii7")
register_codec("ascii7", encode, decode)

View File

View File

@@ -0,0 +1,22 @@
import codecs
def register_codec(name, encode, decode):
class Codec(codecs.Codec):
def encode(self, input, errors="strict"):
return encode(input), len(input)
def decode(self, input, errors="strict"):
return decode(input), len(input)
def search_function(encoding):
if encoding == name:
return codecs.CodecInfo(
name=name,
encode=Codec().encode,
decode=Codec().decode
)
return None
codecs.register(search_function)

View File

@@ -1,75 +1,40 @@
from argparse import ArgumentParser
from pathlib import Path
import argparse
import codecs
import requests
from tabulate import tabulate
import sys
import shutil
import os
codecs_dir = "text_codecs"
# Make sure Python can see the codec modules
sys.path.append(os.path.dirname(__file__))
def evaluate_encoding(encoding, text_path):
""
# Import codecs so they register
import alphanumeric
import ascii7
import ascii6
def evaluate_encoding(encoding, filename):
print(f"Evaluating encoding {encoding}")
try:
codecs.lookup(encoding)
except LookupError:
__import__(encoding)
text = Path(text_path).read_text()
compressed_path = Path(text_path).with_suffix('.' + encoding)
if compressed_path.exists():
compressed = compressed_path.read_bytes()
else:
with open(filename, "r", encoding="utf8") as f:
text = f.read()
compressed = text.encode(encoding)
compressed_path.write_bytes(compressed)
original_size = len(text.encode('utf8'))
compressed_size = len(compressed)
return {
'encoding': encoding,
'original': original_size,
'compressed': compressed_size,
'compression_rate': compressed_size / original_size
}
def inspect_encoded_text(encoding, text_path):
try:
codecs.lookup(encoding)
except LookupError:
__import__(encoding)
compressed_path = Path(text_path).with_suffix('.' + encoding)
if not compressed_path.exists():
compressed = Path(text_path).read_text().encode(encoding)
compressed_path.write_bytes(compressed)
with open(compressed_path, 'rb') as fh:
text = fh.read().decode(encoding)
return text
bits = len(compressed) * 8
chars = len(text)
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"
return (encoding, bits, chars)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("encodings", nargs="+")
parser.add_argument("--text", default="hello.txt")
parser = ArgumentParser("A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()
if args.clean:
shutil.rmtree(text_dir)
if not text_dir.exists():
text_dir.mkdir()
if not Path(args.text).exists():
if args.text == default_text:
print("Downloading 'little_women.txt' from Project Gutenberg...")
response = requests.get(default_text_url, stream=True)
with open(default_text, 'wb') as f:
f.write(response.content)
else:
raise ValueError(f"{args.text} does not exist")
sys.path.append(codecs_dir)
if args.encodings:
results = [evaluate_encoding(e, args.text) for e in args.encodings]
print(tabulate(results, headers="keys"))
if args.inspect:
print(inspect_encoded_text(args.inspect, args.text))
results = [evaluate_encoding(e, args.text) for e in args.encodings]
print("\nResults:")
for enc, bits, chars in results:
print(f"{enc}: {bits} bits for {chars} characters")