Move eval script
This commit is contained in:
75
text_codecs/evaluate.py
Normal file
75
text_codecs/evaluate.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
import codecs
|
||||
import requests
|
||||
from tabulate import tabulate
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
# Directory appended to sys.path by the script body below, so that a codec
# module can be imported by its bare name when the codec is not yet registered.
# NOTE(review): the path is relative, so it is resolved against the current
# working directory — presumably the repository root; confirm.
codecs_dir = "text_codecs"
|
||||
|
||||
def evaluate_encoding(encoding, text_path):
    """Measure how many bytes ``encoding`` needs to represent a text file.

    The encoded bytes are cached next to the text file, with the codec name
    used as the file suffix (``little_women.txt`` becomes
    ``little_women.<encoding>``). When that cache file already exists it is
    read back as-is instead of re-encoding the text.

    Args:
        encoding: Name of a codec known to ``codecs.lookup``. If the lookup
            fails, a module of the same name is imported first (presumably
            so that it can register the codec).
        text_path: Path of the text file to evaluate; it is read as UTF-8.

    Returns:
        A dict with the keys ``encoding``, ``original`` (size in bytes of the
        text re-encoded as UTF-8), ``compressed`` (size in bytes of the
        encoded form) and ``compression_rate`` (``compressed / original``,
        or NaN when the text is empty and the ratio is undefined).

    Raises:
        ImportError: If the codec is unknown and no module named
            ``encoding`` can be imported.
    """
    print(f"Evaluating encoding {encoding}")
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec: import the module of the same name and let the
        # encode call below find whatever it registered.
        __import__(encoding)

    source_path = Path(text_path)
    # Decode explicitly as UTF-8: the baseline size below is measured in
    # UTF-8, and relying on the locale's default encoding makes the result
    # platform-dependent.
    text = source_path.read_text(encoding="utf-8")

    compressed_path = source_path.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        compressed = text.encode(encoding)
        compressed_path.write_bytes(compressed)

    original_size = len(text.encode('utf8'))
    compressed_size = len(compressed)
    # An empty text has no meaningful ratio; report NaN instead of raising
    # ZeroDivisionError and aborting the whole results table.
    if original_size:
        compression_rate = compressed_size / original_size
    else:
        compression_rate = float("nan")

    return {
        'encoding': encoding,
        'original': original_size,
        'compressed': compressed_size,
        'compression_rate': compression_rate,
    }
|
||||
|
||||
def inspect_encoded_text(encoding, text_path):
    """Return the text obtained by decoding the encoded form of a text file.

    The encoded bytes live next to the text file, with the codec name used
    as the file suffix. When that file is missing it is created first by
    encoding the text; when it exists it is decoded as-is.

    Args:
        encoding: Name of a codec known to ``codecs.lookup``. If the lookup
            fails, a module of the same name is imported first (presumably
            so that it can register the codec).
        text_path: Path of the source text file; it is read as UTF-8 when
            the encoded file has to be generated.

    Returns:
        The string produced by decoding the encoded bytes with ``encoding``.

    Raises:
        ImportError: If the codec is unknown and no module named
            ``encoding`` can be imported.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec: import the module of the same name and let the
        # encode/decode calls below find whatever it registered.
        __import__(encoding)

    source_path = Path(text_path)
    compressed_path = source_path.with_suffix('.' + encoding)
    if compressed_path.exists():
        compressed = compressed_path.read_bytes()
    else:
        # Decode explicitly as UTF-8 rather than with the locale default so
        # the generated file is the same on every platform.
        compressed = source_path.read_text(encoding="utf-8").encode(encoding)
        compressed_path.write_bytes(compressed)

    return compressed.decode(encoding)
|
||||
|
||||
# Working directory for the script: it is created on start-up and removed by
# --clean; the default sample text is downloaded into it.
text_dir = Path("texts")
default_text = "texts/little_women.txt"
default_text_url = "https://www.gutenberg.org/cache/epub/37106/pg37106.txt"

# The summary must be passed as ``description``: the first positional
# parameter of ArgumentParser is ``prog``, which would put the whole sentence
# into the usage line as the program name.
parser = ArgumentParser(
    description="A command-line tool which measures compression rates.",
)
parser.add_argument(
    "encodings",
    nargs="*",
    help="Codecs to evaluate",
)
parser.add_argument("--inspect", "-i", help="See resulting text for a codec")
parser.add_argument(
    "--text",
    "-t",
    # Reuse the constant so the "is this the default text?" check made later
    # in the script cannot drift from the actual default.
    default=default_text,
    help="Text file to use for testing",
)
parser.add_argument(
    "--clean",
    "-c",
    action="store_true",
    help="Remove all generated files",
)
|
||||
args = parser.parse_args()

# --clean wipes the whole working directory. Guard on existence so that a
# first run with --clean does not crash with FileNotFoundError.
if args.clean and text_dir.exists():
    shutil.rmtree(text_dir)

# (Re)create the working directory; exist_ok avoids a separate exists() check.
text_dir.mkdir(exist_ok=True)

if not Path(args.text).exists():
    # Only the default sample text can be fetched automatically.
    if args.text != default_text:
        raise ValueError(f"{args.text} does not exist")
    print("Downloading 'little_women.txt' from Project Gutenberg...")
    # The timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(default_text_url, timeout=30)
    # Fail loudly on an HTTP error instead of saving the error page as the
    # sample text, where it would be silently reused on later runs.
    response.raise_for_status()
    with open(default_text, 'wb') as f:
        f.write(response.content)

# Make modules in the codecs directory importable by bare name; the evaluate
# and inspect helpers import a module named after any codec they cannot look up.
sys.path.append(codecs_dir)

if args.encodings:
    results = [evaluate_encoding(e, args.text) for e in args.encodings]
    print(tabulate(results, headers="keys"))

if args.inspect:
    print(inspect_encoded_text(args.inspect, args.text))
|
||||
|
Reference in New Issue
Block a user