generated from mwc/lab_compression
this was hard, lol
This commit is contained in:
@@ -1,39 +1,26 @@
|
|||||||
import string
|
|
||||||
import codecs
|
import codecs
|
||||||
from custom_codecs.register import register_codec
|
from custom_codecs.register import register_codec
|
||||||
from easybits import Bits
|
|
||||||
|
|
||||||
allowed_characters = string.ascii_letters + string.digits
|
|
||||||
|
|
||||||
def encode(text):
    """A (very) lossy encoder which only saves ASCII letters, digits, and spaces.

    Everything else is discarded. Any whitespace character (e.g. a tab or a
    newline) is converted into a space, and consecutive whitespace is
    collapsed into a single space.

    Args:
        text: The string to encode.

    Returns:
        The kept characters as ASCII bytes, one byte per character.
    """
    kept = []
    last_was_space = False
    for char in text:
        # Check for ASCII explicitly: str.isalnum() is also true for
        # non-ASCII letters and digits such as 'é' or '²', and those would
        # make the final .encode("ascii") raise UnicodeEncodeError.
        if char.isascii() and char.isalnum():
            kept.append(char)
            last_was_space = False
        elif char.isspace():
            if not last_was_space:
                kept.append(" ")
            # Discarded characters (punctuation etc.) do not reset this flag,
            # so "a ! b" still yields a single space between the letters.
            last_was_space = True
    return "".join(kept).encode("ascii")
|
||||||
|
|
||||||
|
|
||||||
def decode(data):
    """Decode bytes produced by ``encode`` back into a string.

    Args:
        data: A bytes-like object (bytes, bytearray, or memoryview) holding
            ASCII-encoded text.

    Returns:
        The decoded string.

    Raises:
        UnicodeDecodeError: If ``data`` contains a byte outside the ASCII range.
    """
    # Copy into bytes first: when a codec from the registry is used through
    # bytes.decode(), CPython hands the decoder a memoryview, which has no
    # .decode() method.
    return bytes(data).decode("ascii")
|
|
||||||
|
|
||||||
# register_codec expects the codec name first, followed by the encoder and
# the decoder.
register_codec("alphanumeric", encode, decode)
|
||||||
|
|||||||
32
text_codecs/ascii6.py
Normal file
32
text_codecs/ascii6.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""
|
||||||
|
ascii6 codec
|
||||||
|
|
||||||
|
Compression idea:
|
||||||
|
Use only lower-case letters and spaces.
|
||||||
|
Everything else is removed.
|
||||||
|
|
||||||
|
Compression rate:
|
||||||
|
Better than ascii7 because fewer characters are stored.
|
||||||
|
|
||||||
|
Quality loss:
|
||||||
|
Uppercase letters are converted to lower case; digits, punctuation, and special characters are removed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from custom_codecs.register import register_codec
|
||||||
|
|
||||||
|
|
||||||
|
def encode(text):
    """Lossy encoder that keeps only lower-case ASCII letters and spaces.

    The text is lower-cased first, so upper-case letters are folded to lower
    case rather than dropped. Every other character (digits, punctuation,
    non-ASCII letters, tabs, newlines, ...) is discarded.

    Args:
        text: The string to encode.

    Returns:
        The kept characters as ASCII bytes, one byte per character.
    """
    # Compare against the ASCII range instead of using str.isalpha():
    # isalpha() is also true for letters such as 'é', which would make the
    # final .encode("ascii") raise UnicodeEncodeError.
    kept = [c for c in text.lower() if "a" <= c <= "z" or c == " "]
    return "".join(kept).encode("ascii")
|
||||||
|
|
||||||
|
|
||||||
|
def decode(data):
    """Decode bytes produced by ``encode`` back into a string.

    Args:
        data: A bytes-like object (bytes, bytearray, or memoryview) holding
            ASCII-encoded text.

    Returns:
        The decoded string.

    Raises:
        UnicodeDecodeError: If ``data`` contains a byte outside the ASCII range.
    """
    # Copy into bytes first: when a codec from the registry is used through
    # bytes.decode(), CPython hands the decoder a memoryview, which has no
    # .decode() method.
    return bytes(data).decode("ascii")
|
||||||
|
|
||||||
|
|
||||||
|
# Register this module's encode/decode pair under the codec name "ascii6".
register_codec("ascii6", encode, decode)
|
||||||
@@ -1,33 +1,16 @@
|
|||||||
from custom_codecs.register import register_codec
|
from custom_codecs.register import register_codec
|
||||||
from easybits import Bits
|
|
||||||
|
|
||||||
def encode(text):
    """Encode text as plain ASCII, dropping every non-ASCII character.

    Each kept character is written as one full byte; no bit packing is
    performed, so pure-ASCII input is returned at its original size.

    Args:
        text: The string to encode.

    Returns:
        The ASCII characters of ``text`` as bytes, in their original order.
    """
    # errors="ignore" makes the ASCII codec skip any character it cannot
    # represent, which is exactly the set of code points >= 128.
    return text.encode("ascii", errors="ignore")
|
|
||||||
|
|
||||||
def decode(data):
    """The matching decoder: interprets every byte as one ASCII character.

    Args:
        data: A bytes-like object (bytes, bytearray, or memoryview) holding
            ASCII-encoded text.

    Returns:
        The decoded string.

    Raises:
        UnicodeDecodeError: If ``data`` contains a byte outside the ASCII range.
    """
    # Copy into bytes first: when a codec from the registry is used through
    # bytes.decode(), CPython hands the decoder a memoryview, which has no
    # .decode() method.
    return bytes(data).decode("ascii")
|
|
||||||
|
|
||||||
# register_codec expects the codec name first, followed by the encoder and
# the decoder.
register_codec("ascii7", encode, decode)
|
||||||
|
|||||||
0
text_codecs/custom_codecs/__init__.py
Normal file
0
text_codecs/custom_codecs/__init__.py
Normal file
22
text_codecs/custom_codecs/register.py
Normal file
22
text_codecs/custom_codecs/register.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
import codecs
|
||||||
|
|
||||||
|
def register_codec(name, encode, decode):
    """Register a pair of functions as a named text codec.

    After this call, ``some_str.encode(name)`` and ``some_bytes.decode(name)``
    are routed through the supplied functions.

    Args:
        name: The codec name. It is matched case-insensitively, with hyphens
            and spaces treated as underscores, because the codec registry
            normalises names before calling search functions.
        encode: Callable taking a ``str`` and returning ``bytes``.
        decode: Callable taking ``bytes`` and returning a ``str``.
    """

    def normalize(encoding):
        # Mirror the registry's own normalisation so that a name such as
        # "My-Codec" can still be found; applying it to both sides keeps the
        # comparison correct on interpreters that leave hyphens untouched.
        return encoding.lower().replace("-", "_").replace(" ", "_")

    wanted = normalize(name)

    def codec_encode(text, errors="strict"):
        # The codec protocol requires (output, length_consumed).
        return encode(text), len(text)

    def codec_decode(data, errors="strict"):
        # The registry may pass a memoryview; give the decoder real bytes so
        # it can use the full bytes API (e.g. .decode()).
        raw = bytes(data)
        return decode(raw), len(raw)

    def search_function(encoding):
        if normalize(encoding) != wanted:
            return None
        return codecs.CodecInfo(
            name=name,
            encode=codec_encode,
            decode=codec_decode,
        )

    codecs.register(search_function)
|
||||||
|
|
||||||
|
|
||||||
@@ -1,75 +1,40 @@
|
|||||||
from argparse import ArgumentParser
|
|
||||||
from pathlib import Path
|
import argparse
|
||||||
import codecs
|
import codecs
|
||||||
import requests
|
|
||||||
from tabulate import tabulate
|
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import os
|
||||||
|
|
||||||
codecs_dir = "text_codecs"
|
# Make sure Python can see the codec modules
|
||||||
|
sys.path.append(os.path.dirname(__file__))
|
||||||
|
|
||||||
def evaluate_encoding(encoding, text_path):
|
# Import codecs so they register
|
||||||
""
|
import alphanumeric
|
||||||
|
import ascii7
|
||||||
|
import ascii6
|
||||||
|
|
||||||
|
def evaluate_encoding(encoding, filename):
|
||||||
print(f"Evaluating encoding {encoding}")
|
print(f"Evaluating encoding {encoding}")
|
||||||
try:
|
|
||||||
codecs.lookup(encoding)
|
with open(filename, "r", encoding="utf8") as f:
|
||||||
except LookupError:
|
text = f.read()
|
||||||
__import__(encoding)
|
|
||||||
text = Path(text_path).read_text()
|
|
||||||
compressed_path = Path(text_path).with_suffix('.' + encoding)
|
|
||||||
if compressed_path.exists():
|
|
||||||
compressed = compressed_path.read_bytes()
|
|
||||||
else:
|
|
||||||
compressed = text.encode(encoding)
|
compressed = text.encode(encoding)
|
||||||
compressed_path.write_bytes(compressed)
|
|
||||||
original_size = len(text.encode('utf8'))
|
|
||||||
compressed_size = len(compressed)
|
|
||||||
return {
|
|
||||||
'encoding': encoding,
|
|
||||||
'original': original_size,
|
|
||||||
'compressed': compressed_size,
|
|
||||||
'compression_rate': compressed_size / original_size
|
|
||||||
}
|
|
||||||
|
|
||||||
def inspect_encoded_text(encoding, text_path):
|
bits = len(compressed) * 8
|
||||||
try:
|
chars = len(text)
|
||||||
codecs.lookup(encoding)
|
|
||||||
except LookupError:
|
|
||||||
__import__(encoding)
|
|
||||||
compressed_path = Path(text_path).with_suffix('.' + encoding)
|
|
||||||
if not compressed_path.exists():
|
|
||||||
compressed = Path(text_path).read_text().encode(encoding)
|
|
||||||
compressed_path.write_bytes(compressed)
|
|
||||||
with open(compressed_path, 'rb') as fh:
|
|
||||||
text = fh.read().decode(encoding)
|
|
||||||
return text
|
|
||||||
|
|
||||||
text_dir = Path("texts")
|
return (encoding, bits, chars)
|
||||||
default_text = "texts/little_women.txt"
|
|
||||||
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("encodings", nargs="+")
|
||||||
|
parser.add_argument("--text", default="hello.txt")
|
||||||
|
|
||||||
parser = ArgumentParser("A command-line tool which measures compression rates.")
|
|
||||||
parser.add_argument("encodings", nargs="*")
|
|
||||||
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
|
|
||||||
parser.add_argument("--text", "-t", default="texts/little_women.txt", help="Text file to use for testing")
|
|
||||||
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.clean:
|
|
||||||
shutil.rmtree(text_dir)
|
|
||||||
if not text_dir.exists():
|
|
||||||
text_dir.mkdir()
|
|
||||||
if not Path(args.text).exists():
|
|
||||||
if args.text == default_text:
|
|
||||||
print("Downloading 'little_women.txt' from Project Gutenberg...")
|
|
||||||
response = requests.get(default_text_url, stream=True)
|
|
||||||
with open(default_text, 'wb') as f:
|
|
||||||
f.write(response.content)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"{args.text} does not exist")
|
|
||||||
sys.path.append(codecs_dir)
|
|
||||||
if args.encodings:
|
|
||||||
results = [evaluate_encoding(e, args.text) for e in args.encodings]
|
|
||||||
print(tabulate(results, headers="keys"))
|
|
||||||
if args.inspect:
|
|
||||||
print(inspect_encoded_text(args.inspect, args.text))
|
|
||||||
|
|
||||||
|
results = [evaluate_encoding(e, args.text) for e in args.encodings]
|
||||||
|
|
||||||
|
print("\nResults:")
|
||||||
|
for enc, bits, chars in results:
|
||||||
|
print(f"{enc}: {bits} bits for {chars} characters")
|
||||||
|
|||||||
Reference in New Issue
Block a user