diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..4a96c22 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +source .venv/bin/activate \ No newline at end of file diff --git a/hello.txt b/hello.txt new file mode 100644 index 0000000..3462721 --- /dev/null +++ b/hello.txt @@ -0,0 +1 @@ +hello! \ No newline at end of file diff --git a/text_codecs/alphanumeric.py b/text_codecs/alphanumeric.py index e8cc2ea..fdec120 100644 --- a/text_codecs/alphanumeric.py +++ b/text_codecs/alphanumeric.py @@ -1,6 +1,5 @@ import string import codecs -from custom_codecs.register import register_codec from easybits import Bits allowed_characters = string.ascii_letters + string.digits @@ -36,4 +35,20 @@ def decode(data): breakpoint() return text +def register_codec(encode, decode, name): + """Registers a codec so that it can later be used to encode + or decode strings and bytes. + """ + def encode_wrapper(text): + return encode(text), len(text) + + def decode_wrapper(data): + return decode(data), len(data) + + def search_for_codec(query): + if query == name: + return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name) + + codecs.register(search_for_codec) + register_codec(encode, decode, "alphanumeric") diff --git a/text_codecs/ascii7.py b/text_codecs/ascii7.py index fb24cd0..350245d 100644 --- a/text_codecs/ascii7.py +++ b/text_codecs/ascii7.py @@ -1,6 +1,5 @@ -from custom_codecs.register import register_codec from easybits import Bits - +import codecs def encode(text): """An encoder which only handles ASCII: non-ASCII characters are replaced with '?'. Once all the characters are ASCII, this encoder @@ -30,4 +29,21 @@ def decode(data): text += Bits(byte).ascii return text -register_codec(encode, decode, "ascii7") + +def register_codec(encode, decode, name): + """Registers a codec so that it can later be used to encode + or decode strings and bytes. + """ + def encode_wrapper(text): + return encode(text), len(text) + + def decode_wrapper(data): + return decode(data), len(data) + + def search_for_codec(query): + if query == name: + return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name) + + codecs.register(search_for_codec) + +register_codec(encode, decode, "ascii7") \ No newline at end of file diff --git a/text_codecs/commonchars.py b/text_codecs/commonchars.py new file mode 100644 index 0000000..7abb4d4 --- /dev/null +++ b/text_codecs/commonchars.py @@ -0,0 +1,93 @@ +from easybits import Bits +import codecs + +def encode(text): + "An encoder which only handles ASCII: non-ASCII characters are replaced with '?'. " + "this will remove the most common characters in english texts [e, t, a, o, i, n, s, h] and by writing a 1 and followed by 3 bits" + "other charcaters will be written with stadard 8 bit code" + "this ecoding has about a 77 percent compression rate" + "there is some loss in quality since unknown characters are replaced with ? and not decoded" + + result = Bits() + commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h'] + for char in text: + try: + b = Bits(char, encoding='ascii') + except UnicodeEncodeError: + b = Bits('?', encoding='ascii') + char = '?' + if char in commonchars: + result = result.concat(Bits('1')) + i = commonchars.index(char) + if i == 0: + result = result.concat(Bits('000')) + elif i == 1: + result = result.concat(Bits("001")) + elif i == 2: + result = result.concat(Bits("010")) + elif i == 3: + result = result.concat(Bits("011")) + elif i == 4: + result = result.concat(Bits("100")) + elif i == 5: + result = result.concat(Bits("101")) + elif i == 6: + result = result.concat(Bits("110")) + elif i == 7: + result = result.concat(Bits("111")) + else: + result = result.concat(b) + return result.bytes + +def decode(data): + "Matching decoder. Checks if first bit is 1 or 0. If it is 1, then it changes character out of encoded form, which is 3 bits long. " + "If not, it just converts it using standard ascii which is 8 bits." + bits = Bits(bytes(data)) + text = "" + commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h'] + i = 0 + while i < len(bits): + if bits[i] == 1: + byte = bits[i+1:i+4] + if byte == Bits('000'): + text += commonchars[0] + elif byte == Bits('001'): + text += commonchars[1] + elif byte == Bits('010'): + text += commonchars[2] + elif byte == Bits('011'): + text += commonchars[3] + elif byte == Bits('100'): + text += commonchars[4] + elif byte == Bits('101'): + text += commonchars[5] + elif byte == Bits('110'): + text += commonchars[6] + elif byte == Bits('111'): + text += commonchars[7] + i += 4 + + else: + b = bits[i:i+8] + text += b.ascii + i += 8 + + return text + +def register_codec(encode, decode, name): + """Registers a codec so that it can later be used to encode + or decode strings and bytes. + """ + def encode_wrapper(text): + return encode(text), len(text) + + def decode_wrapper(data): + return decode(data), len(data) + + def search_for_codec(query): + if query == name: + return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name) + + codecs.register(search_for_codec) + +register_codec(encode, decode, "commonchars") \ No newline at end of file diff --git a/text_codecs/evaluate.py b/text_codecs/evaluate.py index 8aff393..9b79dbd 100644 --- a/text_codecs/evaluate.py +++ b/text_codecs/evaluate.py @@ -72,4 +72,20 @@ if args.encodings: print(tabulate(results, headers="keys")) if args.inspect: print(inspect_encoded_text(args.inspect, args.text)) + +def register_codec(encode, decode, name): + """Registers a codec so that it can later be used to encode + or decode strings and bytes. + """ + def encode_wrapper(text): + return encode(text), len(text) + def decode_wrapper(data): + return decode(data), len(data) + + def search_for_codec(query): + if query == name: + return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name) + + codecs.register(search_for_codec) + codecs.register(search_for_codec) \ No newline at end of file