generated from mwc/lab_compression
	
		
			
				
	
	
		
			76 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			76 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from argparse import ArgumentParser
 | 
						|
from pathlib import Path
 | 
						|
import codecs
 | 
						|
import requests
 | 
						|
from tabulate import tabulate
 | 
						|
import sys
 | 
						|
import shutil
 | 
						|
 | 
						|
# Directory appended to sys.path by the script below before any codec is
# used; presumably it holds custom codec modules, so that the
# `__import__(encoding)` fallback can find them by name.
codecs_dir = "text_codecs"
 | 
						|
 | 
						|
def evaluate_encoding(encoding, text_path):
    """Measure how compactly a codec encodes a text file.

    The encoded bytes are cached beside the text file, with the codec name as
    the file suffix (``book.txt`` -> ``book.<encoding>``). An existing cache
    file is reused as-is and is NOT refreshed if the text has changed since.

    Args:
        encoding: Name of a registered codec. If ``codecs.lookup`` does not
            know it, a module of the same name is imported (presumably that
            module registers the codec as an import side effect).
        text_path: Path of the text file to encode. It is read as UTF-8.

    Returns:
        A dict with the keys ``'encoding'``, ``'original'`` (size in bytes of
        the text encoded as UTF-8), ``'compressed'`` (size in bytes of the
        encoded text) and ``'compression_rate'`` (compressed / original, or
        NaN when the text is empty and the ratio is undefined).

    Raises:
        ImportError: If the codec is unknown and no such module exists.
    """
    print(f"Evaluating encoding {encoding}")
    try:
        codecs.lookup(encoding)
    except LookupError:
        __import__(encoding)

    source_path = Path(text_path)
    # Decode explicitly as UTF-8: the baseline size below is measured in
    # UTF-8, so relying on the platform's locale encoding would make the
    # result machine-dependent (or fail outright on non-UTF-8 locales).
    text = source_path.read_text(encoding="utf-8")

    compressed_path = source_path.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)

    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    # An empty text has no meaningful ratio; report NaN instead of crashing
    # with ZeroDivisionError.
    if original_size:
        compression_rate = compressed_size / original_size
    else:
        compression_rate = float("nan")
    return {
        'encoding': encoding,
        'original': original_size,
        'compressed': compressed_size,
        'compression_rate': compression_rate,
    }
 | 
						|
 | 
						|
def inspect_encoded_text(encoding, text_path):
    """Return the text recovered by decoding the cached encoded file.

    The encoded file lives beside the text file, with the codec name as its
    suffix (``book.txt`` -> ``book.<encoding>``). If it does not exist yet it
    is created by encoding the text first; an existing file is decoded as-is.

    Args:
        encoding: Name of a registered codec. If ``codecs.lookup`` does not
            know it, a module of the same name is imported (presumably that
            module registers the codec as an import side effect).
        text_path: Path of the source text file. It is read as UTF-8 when the
            encoded file has to be generated.

    Returns:
        The string produced by decoding the encoded file with ``encoding``.

    Raises:
        ImportError: If the codec is unknown and no such module exists.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        __import__(encoding)

    source_path = Path(text_path)
    compressed_path = source_path.with_suffix('.' + encoding)
    if not compressed_path.exists():
        # Read explicitly as UTF-8 rather than the locale's default encoding
        # so the generated file does not depend on the machine it is made on.
        compressed = source_path.read_text(encoding="utf-8").encode(encoding)
        compressed_path.write_bytes(compressed)
    return compressed_path.read_bytes().decode(encoding)
 | 
						|
 | 
						|
# Working directory for the default sample text. Encoded files are written
# beside whichever text is used, so for the default text they land here too.
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"

# ArgumentParser's first positional parameter is `prog`, not the summary, so
# the text must be passed as `description` to show up correctly in --help.
parser = ArgumentParser(description="A command-line tool which measures compression rates.")
parser.add_argument("encodings", nargs="*")
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument("--text", "-t", default=default_text, help="Text file to use for testing")
parser.add_argument("--clean", "-c", action="store_true", help="Remove all generated files")
args = parser.parse_args()

# --clean wipes the whole directory (the downloaded text included); skip it
# when the directory is absent so a repeated --clean does not crash.
if args.clean and text_dir.exists():
    shutil.rmtree(text_dir)
text_dir.mkdir(exist_ok=True)

if not Path(args.text).exists():
    if args.text == default_text:
        print("Downloading 'little_women.txt' from Project Gutenberg...")
        response = requests.get(default_text_url, timeout=60)
        # Fail loudly on an HTTP error instead of saving the error page as
        # the sample text.
        response.raise_for_status()
        with open(default_text, 'wb') as f:
            f.write(response.content)
    else:
        raise ValueError(f"{args.text} does not exist")

# Make modules in codecs_dir importable so the `__import__(encoding)` fallback
# in the functions above can find them. The path is relative to the current
# working directory.
sys.path.append(codecs_dir)

if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))
if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))
 | 
						|
 |