"""Measure how compactly different text codecs encode a sample text.

Each requested encoding is applied to a text file, the encoded bytes are
cached next to that file (same stem, suffix = encoding name), and the encoded
size is compared against the size of the text's UTF-8 encoding.
"""

from argparse import ArgumentParser
from pathlib import Path
from typing import Union
import codecs
import importlib
import shutil
import sys

# NOTE: the third-party imports (`requests`, `tabulate`) are deferred to the
# code paths that need them, so the codec helpers below can be imported (for
# example from tests) with only the standard library available.

codecs_dir = "text_codecs"
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"


def _ensure_codec(encoding: str) -> None:
    """Make sure *encoding* is known to the ``codecs`` registry.

    Built-in codecs are found directly. Otherwise a module with the same name
    is imported, with ``codecs_dir`` on ``sys.path``; that module is
    presumably expected to register the codec when imported.

    Raises:
        ImportError: if no codec and no importable module has that name.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        if codecs_dir not in sys.path:
            sys.path.append(codecs_dir)
        # NOTE(review): this imports a module named by a command-line
        # argument; only use encoding names you trust.
        importlib.import_module(encoding)


def _encoded_path(encoding: str, text_path: Union[str, Path]) -> Path:
    """Return the cache path for *text_path* encoded with *encoding*."""
    return Path(text_path).with_suffix("." + encoding)


def _read_text(text_path: Union[str, Path]) -> str:
    """Read the sample text as UTF-8, independent of the platform locale."""
    return Path(text_path).read_text(encoding="utf-8")


def evaluate_encoding(encoding: str, text_path: Union[str, Path]) -> dict:
    """Encode the text at *text_path* with *encoding* and report its size.

    The encoded bytes are cached beside the text file; an existing cache file
    is reused instead of re-encoding.

    Args:
        encoding: Name of a built-in codec, or of a module that provides one.
        text_path: Path of the UTF-8 text file to encode.

    Returns:
        A dict with the keys ``encoding``, ``original`` (UTF-8 size in bytes),
        ``compressed`` (encoded size in bytes) and ``compression_rate``
        (``compressed / original``, or NaN when the text is empty).
    """
    print(f"Evaluating encoding {encoding}")
    _ensure_codec(encoding)

    text = _read_text(text_path)
    compressed_path = _encoded_path(encoding, text_path)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)

    original_size = len(text.encode("utf8"))
    compressed_size = len(compressed)
    # An empty text has no meaningful ratio; report NaN instead of raising
    # ZeroDivisionError.
    if original_size:
        compression_rate = compressed_size / original_size
    else:
        compression_rate = float("nan")

    return {
        "encoding": encoding,
        "original": original_size,
        "compressed": compressed_size,
        "compression_rate": compression_rate,
    }


def inspect_encoded_text(encoding: str, text_path: Union[str, Path]) -> str:
    """Return the text obtained by decoding the cached encoded file.

    If the cache file for *encoding* does not exist yet, it is created from
    the text at *text_path* first.

    Args:
        encoding: Name of a built-in codec, or of a module that provides one.
        text_path: Path of the UTF-8 text file the cache is derived from.

    Returns:
        The decoded contents of the cache file.
    """
    _ensure_codec(encoding)

    compressed_path = _encoded_path(encoding, text_path)
    if not compressed_path.exists():
        compressed_path.write_bytes(_read_text(text_path).encode(encoding))
    return compressed_path.read_bytes().decode(encoding)


def _build_parser() -> ArgumentParser:
    """Create the command-line parser."""
    # The text must be passed as `description`; the first positional
    # parameter of ArgumentParser is `prog`.
    parser = ArgumentParser(
        description="A command-line tool which measures compression rates."
    )
    parser.add_argument("encodings", nargs="*")
    parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
    parser.add_argument(
        "--text", "-t", default=default_text, help="Text file to use for testing"
    )
    parser.add_argument(
        "--clean", "-c", action="store_true", help="Remove all generated files"
    )
    return parser


def _download_default_text() -> None:
    """Fetch the default sample text and store it at ``default_text``."""
    import requests

    print("Downloading 'little_women.txt' from Project Gutenberg...")
    response = requests.get(default_text_url, timeout=60)
    # Fail loudly rather than saving an HTTP error page as the sample text.
    response.raise_for_status()
    Path(default_text).write_bytes(response.content)


def main(argv=None) -> None:
    """Run the command-line tool.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # --clean wipes the text directory; it may not exist yet on a first run.
    if args.clean and text_dir.exists():
        shutil.rmtree(text_dir)
    text_dir.mkdir(parents=True, exist_ok=True)

    if not Path(args.text).exists():
        if Path(args.text) == Path(default_text):
            _download_default_text()
        else:
            parser.error(f"{args.text} does not exist")

    if args.encodings:
        from tabulate import tabulate

        results = [evaluate_encoding(e, args.text) for e in args.encodings]
        print(tabulate(results, headers="keys"))
    if args.inspect:
        print(inspect_encoded_text(args.inspect, args.text))


if __name__ == "__main__":
    main()