this was hard, lol

This commit is contained in:
mdecker62
2026-03-12 10:49:25 -04:00
parent 49d5936018
commit ab135e9b55
7 changed files with 114 additions and 124 deletions

View File

@@ -1,75 +1,40 @@
from argparse import ArgumentParser
from pathlib import Path
import argparse
import codecs
import requests
from tabulate import tabulate
import sys
import shutil
import os
codecs_dir = "text_codecs"
# Make sure Python can see the codec modules
sys.path.append(os.path.dirname(__file__))
def evaluate_encoding(encoding, text_path):
""
# Import codecs so they register
import alphanumeric
import ascii7
import ascii6
def evaluate_encoding(encoding, filename):
print(f"Evaluating encoding {encoding}")
try:
codecs.lookup(encoding)
except LookupError:
__import__(encoding)
text = Path(text_path).read_text()
compressed_path = Path(text_path).with_suffix('.' + encoding)
if compressed_path.exists():
compressed = compressed_path.read_bytes()
else:
compressed = text.encode(encoding)
compressed_path.write_bytes(compressed)
original_size = len(text.encode('utf8'))
compressed_size = len(compressed)
return {
'encoding': encoding,
'original': original_size,
'compressed': compressed_size,
'compression_rate': compressed_size / original_size
}
def inspect_encoded_text(encoding, text_path):
    """Return the text at *text_path* after a round trip through *encoding*.

    The encoded bytes are cached beside the source file, under the same name
    with its suffix replaced by ``.<encoding>``. If that cache file already
    exists it is decoded as-is; otherwise the source text is encoded and the
    cache file is written first.

    Args:
        encoding: Codec name. When ``codecs.lookup`` does not recognise it, a
            module of the same name is imported (presumably so that the module
            can register the codec as an import side effect).
        text_path: Path of the source text file, read as UTF-8.

    Returns:
        The string obtained by decoding the cached bytes with *encoding*.

    Raises:
        ImportError: If the codec is unknown and no module named *encoding*
            can be imported.
        LookupError: If the codec is still unknown when encoding or decoding.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        __import__(encoding)

    source_path = Path(text_path)
    compressed_path = source_path.with_suffix('.' + encoding)
    if not compressed_path.exists():
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        source_text = source_path.read_text(encoding='utf-8')
        compressed_path.write_bytes(source_text.encode(encoding))
    return compressed_path.read_bytes().decode(encoding)
with open(filename, "r", encoding="utf8") as f:
text = f.read()
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"
compressed = text.encode(encoding)
bits = len(compressed) * 8
chars = len(text)
return (encoding, bits, chars)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("encodings", nargs="+")
parser.add_argument("--text", default="hello.txt")
args = parser.parse_args()
parser = ArgumentParser("A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()
if args.clean:
shutil.rmtree(text_dir)
if not text_dir.exists():
text_dir.mkdir()
if not Path(args.text).exists():
if args.text == default_text:
print("Downloading 'little_women.txt' from Project Gutenberg...")
response = requests.get(default_text_url, stream=True)
with open(default_text, 'wb') as f:
f.write(response.content)
else:
raise ValueError(f"{args.text} does not exist")
sys.path.append(codecs_dir)
if args.encodings:
results = [evaluate_encoding(e, args.text) for e in args.encodings]
print(tabulate(results, headers="keys"))
if args.inspect:
print(inspect_encoded_text(args.inspect, args.text))
print("\nResults:")
for enc, bits, chars in results:
print(f"{enc}: {bits} bits for {chars} characters")