Initial commit

This commit is contained in:
2025-10-01 16:24:49 +00:00
commit 7a62fe5647
8 changed files with 856 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
import string
import codecs
from custom_codecs.register import register_codec
from easybits import Bits
# Characters the encoder keeps verbatim: the ASCII letters followed by the digits.
allowed_characters = "".join((string.ascii_letters, string.digits))
def encode(text):
    """Lossily encode `text`, keeping only ASCII letters, digits and spaces.

    Characters found in `allowed_characters` are kept unchanged. Any
    character in `string.whitespace` becomes a single space, but only when
    the previously kept character is not already a space, so runs of
    whitespace collapse (even when discarded characters sit between them).
    Every other character is dropped.

    Returns whatever `Bits(<filtered text>).bytes` produces.
    """
    kept = []
    for char in text:
        if char in allowed_characters:
            kept.append(char)
            continue
        if char not in string.whitespace:
            # Neither allowed nor whitespace: discard it.
            continue
        # Allowed characters never include ' ', so the last kept item is a
        # space exactly when the previous emission was a collapsed whitespace.
        if not kept or kept[-1] != ' ':
            kept.append(' ')
    return Bits(''.join(kept)).bytes
def decode(data):
    """Decode `data` back into text, one character per byte.

    Each item yielded by iterating `data` is converted with
    `Bits(byte, length=8).ascii` and the results are joined into a string.

    Returns:
        The decoded string only. NOTE(review): the (string, length) pair
        that the codec machinery expects is presumably added by
        `register_codec` -- confirm against custom_codecs.register.

    Raises:
        ValueError: if a byte makes `Bits` raise OverflowError. The message
            carries the offending value and its position. This replaces
            the earlier debug prints and `breakpoint()` call, which would
            halt a non-interactive run.
    """
    characters = []
    for position, byte in enumerate(data):
        try:
            characters.append(Bits(byte, length=8).ascii)
        except OverflowError as err:
            raise ValueError(
                f"cannot decode value {byte!r} at position {position}"
            ) from err
    # Join once at the end instead of growing a string inside the loop.
    return "".join(characters)
# Hand this module's encode/decode pair to register_codec under the name "alphanumeric".
register_codec(encode, decode, "alphanumeric")

33
text_codecs/ascii7.py Normal file
View File

@@ -0,0 +1,33 @@
from custom_codecs.register import register_codec
from easybits import Bits
def _ascii_bits(char):
    """Return the Bits for `char` as ASCII, or the Bits for '?' if it is not ASCII."""
    try:
        return Bits(char, encoding='ascii')
    except UnicodeEncodeError:
        return Bits('?')


def encode(text):
    """Pack ASCII text by dropping the leading bit of every character.

    Characters that cannot be encoded as ASCII are replaced with '?'.
    For each character the first bit is sliced off (`[1:]`) and the
    remaining bits are appended to one running bit sequence. The first bit
    of an ASCII character is always 0, so the decoder can restore it; the
    intent is to store the text in 7/8 of its usual size.

    Returns the `.bytes` of the accumulated bit sequence.
    """
    packed = Bits()
    for char in text:
        packed = packed.concat(_ascii_bits(char)[1:])
    return packed.bytes
def decode(data):
    """Reverse the seven-bit packing done by the matching encoder.

    The input is turned into one bit sequence and walked in steps of seven
    bits. Each group gets a '0' put on the front to rebuild a full byte,
    and that byte's ASCII representation is appended to the result.

    Returns the decoded string.
    """
    bits = Bits(bytes(data))
    pieces = []
    for start in range(0, len(bits), 7):
        full_byte = Bits('0').concat(bits[start:start + 7])
        pieces.append(Bits(full_byte).ascii)
    return "".join(pieces)
# Hand this module's encode/decode pair to register_codec under the name "ascii7".
register_codec(encode, decode, "ascii7")

75
text_codecs/evaluate.py Normal file
View File

@@ -0,0 +1,75 @@
from argparse import ArgumentParser
from pathlib import Path
import codecs
import requests
from tabulate import tabulate
import sys
import shutil
# Directory appended to sys.path further down, before any encoding is evaluated.
codecs_dir = "text_codecs"
def evaluate_encoding(encoding, text_path):
    """Measure how much space `encoding` needs for the text at `text_path`.

    If `encoding` is not a registered codec, a module with the same name is
    imported before the codec is used. The encoded bytes are cached next to
    the text file, using the encoding name as the file suffix; an existing
    cache file is reused as-is, so delete it after changing a codec.

    Args:
        encoding: Codec name (also the name of the module to import when
            the codec is not registered yet).
        text_path: Path of the UTF-8 text file to encode.

    Returns:
        A dict with the keys 'encoding', 'original' (size of the text in
        UTF-8 bytes), 'compressed' (size of the encoded bytes) and
        'compression_rate' (compressed / original, or NaN when the text is
        empty and the ratio is undefined).
    """
    print(f"Evaluating encoding {encoding}")
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec: import the like-named module so it can register itself.
        __import__(encoding)
    source_path = Path(text_path)
    # Read as UTF-8 explicitly: the baseline size below is measured in UTF-8,
    # and the platform default encoding is not UTF-8 everywhere.
    text = source_path.read_text(encoding="utf-8")
    compressed_path = source_path.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)
    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    if original_size:
        compression_rate = compressed_size / original_size
    else:
        # Empty text: avoid ZeroDivisionError and report the ratio as undefined.
        compression_rate = float("nan")
    return {
        'encoding': encoding,
        'original': original_size,
        'compressed': compressed_size,
        'compression_rate': compression_rate,
    }
def inspect_encoded_text(encoding, text_path):
    """Return the text at `text_path` after a round trip through `encoding`.

    If `encoding` is not a registered codec, a module with the same name is
    imported first. The encoded bytes are read from the cache file next to
    the text (same name, encoding as suffix); when that file does not exist
    it is created by encoding the text. The cached bytes are then decoded
    with the same codec and returned, which shows what a lossy codec keeps.

    Args:
        encoding: Codec name (also the module to import if unregistered).
        text_path: Path of the UTF-8 text file.

    Returns:
        The decoded string.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec: import the like-named module so it can register itself.
        __import__(encoding)
    source_path = Path(text_path)
    compressed_path = source_path.with_suffix('.' + encoding)
    if not compressed_path.exists():
        # Read as UTF-8 explicitly rather than relying on the platform default.
        compressed = source_path.read_text(encoding="utf-8").encode(encoding)
        compressed_path.write_bytes(compressed)
    return compressed_path.read_bytes().decode(encoding)
# Working directory for the sample text and the generated encoded files.
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"

# ArgumentParser's first positional parameter is `prog`, so the summary has to
# be passed as `description` to show up as help text rather than as the
# program name in the usage line.
parser = ArgumentParser(description="A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default=default_text, help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()

# --clean must not fail when there is nothing to remove yet.
if args.clean and text_dir.exists():
    shutil.rmtree(text_dir)
if not text_dir.exists():
    text_dir.mkdir()

if not Path(args.text).exists():
    if args.text == default_text:
        print("Downloading 'little_women.txt' from Project Gutenberg...")
        response = requests.get(default_text_url, timeout=30)
        # Fail loudly instead of saving an HTTP error page as the sample text.
        response.raise_for_status()
        with open(default_text, 'wb') as f:
            f.write(response.content)
    else:
        raise ValueError(f"{args.text} does not exist")

# Make modules in the codecs directory importable by bare name.
sys.path.append(codecs_dir)

if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))
if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))

17
text_codecs/register.py Normal file
View File

@@ -0,0 +1,17 @@
import codecs
def register_codec(encode, decode, name):
    """Register `encode`/`decode` as a codec usable via str.encode / bytes.decode.

    Args:
        encode: Callable taking a string and returning the encoded bytes.
        decode: Callable taking the encoded data and returning a string.
        name: Codec name. Matching ignores case and treats hyphens and
            spaces as underscores, because the codec registry hands search
            functions a normalized name.

    The functions are wrapped so they return the (result, length consumed)
    pair the codec machinery expects. The wrappers also accept the optional
    `errors` argument that the machinery passes when a caller supplies one;
    it is accepted for compatibility and otherwise ignored.
    """
    def normalize(codec_name):
        return codec_name.lower().replace("-", "_").replace(" ", "_")

    def encode_wrapper(text, errors="strict"):
        return encode(text), len(text)

    def decode_wrapper(data, errors="strict"):
        return decode(data), len(data)

    normalized_name = normalize(name)

    def search_for_codec(query):
        # Compare normalized forms so names with capitals or hyphens still match.
        if normalize(query) == normalized_name:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        return None

    codecs.register(search_for_codec)