# lab_compression/text_codecs/evaluate.py

from argparse import ArgumentParser
from pathlib import Path
import codecs
import requests
from tabulate import tabulate
import sys
import shutil

# Directory appended to sys.path further down, presumably so that custom codec
# modules stored there can be imported by name when a codec is not built in.
codecs_dir = "text_codecs"

def evaluate_encoding(encoding, text_path):
    """Measure how well a codec compresses a text file.

    The text at ``text_path`` is encoded with ``encoding`` and the result is
    cached beside it, in a file with the same stem and the codec name as its
    suffix. When that cache file already exists it is reused as-is instead of
    encoding the text again.

    Args:
        encoding: Name of a codec known to ``codecs.lookup``, or of an
            importable module (presumably one that registers the codec as a
            side effect of being imported).
        text_path: Path of the text file to encode; it is read as UTF-8.

    Returns:
        A dict with the keys ``'encoding'``, ``'original'`` (byte size of the
        text encoded as UTF-8), ``'compressed'`` (byte size of the encoded
        text) and ``'compression_rate'`` (compressed / original, or NaN when
        the text is empty and the ratio is undefined).
    """
    print(f"Evaluating encoding {encoding}")
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Not a known codec yet: importing a module of the same name gives it
        # the chance to register itself before the text is encoded.
        __import__(encoding)
    source_path = Path(text_path)
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the measurement is the same on every machine and matches
    # the UTF-8 baseline used for the original size.
    text = source_path.read_text(encoding="utf-8")
    compressed_path = source_path.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)
    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    # An empty text has no meaningful ratio; report NaN instead of crashing
    # with ZeroDivisionError.
    if original_size:
        compression_rate = compressed_size / original_size
    else:
        compression_rate = float("nan")
    return {
        'encoding': encoding,
        'original': original_size,
        'compressed': compressed_size,
        'compression_rate': compression_rate,
    }

def inspect_encoded_text(encoding, text_path):
    """Return the text recovered by decoding the cached encoded file.

    The encoded file lives beside ``text_path``, with the same stem and the
    codec name as its suffix. When it does not exist yet, it is created by
    encoding the text first; an existing file is decoded as-is.

    Args:
        encoding: Name of a codec known to ``codecs.lookup``, or of an
            importable module (presumably one that registers the codec as a
            side effect of being imported).
        text_path: Path of the source text file; it is read as UTF-8.

    Returns:
        The string produced by decoding the encoded file with ``encoding``.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Not a known codec yet: importing a module of the same name gives it
        # the chance to register itself before it is used.
        __import__(encoding)
    source_path = Path(text_path)
    compressed_path = source_path.with_suffix('.' + encoding)
    if not compressed_path.exists():
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, so the cached file does not depend on the machine.
        text = source_path.read_text(encoding="utf-8")
        compressed_path.write_bytes(text.encode(encoding))
    # Always decode from the file on disk so the caller sees exactly what the
    # stored bytes round-trip to.
    return compressed_path.read_bytes().decode(encoding)

# Directory that holds the default sample text; `--clean` deletes it entirely.
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"

# The summary must be passed as `description`: the first positional parameter
# of ArgumentParser is `prog`, which would print this sentence as the program
# name in the usage line.
parser = ArgumentParser(description="A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default=default_text, help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()

# Cleaning must not fail when there is nothing to remove.
if args.clean and text_dir.exists():
    shutil.rmtree(text_dir)
text_dir.mkdir(parents=True, exist_ok=True)

if not Path(args.text).exists():
    # Only the default text can be fetched; any other missing file is an error.
    if args.text != default_text:
        raise ValueError(f"{args.text} does not exist")
    print("Downloading 'little_women.txt' from Project Gutenberg...")
    response = requests.get(default_text_url, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the text.
    response.raise_for_status()
    Path(default_text).write_bytes(response.content)

# Make modules in the codecs directory importable by bare name, which is how
# the functions above load a codec that is not already registered.
sys.path.append(codecs_dir)
if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))
if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))