generated from mwc/lab_compression
Implemented encoding/decoding in commonchars.py
This commit is contained in:
@@ -1,6 +1,5 @@
|
|||||||
import string
|
import string
|
||||||
import codecs
|
import codecs
|
||||||
from custom_codecs.register import register_codec
|
|
||||||
from easybits import Bits
|
from easybits import Bits
|
||||||
|
|
||||||
allowed_characters = string.ascii_letters + string.digits
|
allowed_characters = string.ascii_letters + string.digits
|
||||||
@@ -36,4 +35,20 @@ def decode(data):
|
|||||||
breakpoint()
|
breakpoint()
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def register_codec(encode, decode, name):
    """Register a custom codec under *name* with the ``codecs`` module.

    After registration, ``str.encode(name)`` and ``bytes.decode(name)``
    (as well as ``codecs.encode`` / ``codecs.decode``) dispatch to the
    supplied functions.

    Args:
        encode: Callable taking the text to encode and returning the
            encoded data.
        decode: Callable taking the encoded data and returning text.
        name: Name under which the codec is looked up.
    """
    # The codec machinery hands search functions a normalized name (lower
    # case; recent Python versions also turn hyphens and spaces into
    # underscores), so normalize both sides before comparing.
    def normalize(codec_name):
        return codec_name.lower().replace("-", "_").replace(" ", "_")

    registered_name = normalize(name)

    # ``errors`` is accepted because the codec machinery forwards it whenever
    # the caller supplies one, e.g. ``text.encode(name, "strict")``. The
    # wrapped functions have no error modes, so the value is ignored.
    def encode_wrapper(text, errors="strict"):
        return encode(text), len(text)

    def decode_wrapper(data, errors="strict"):
        return decode(data), len(data)

    def search_for_codec(query):
        if normalize(query) == registered_name:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        return None

    codecs.register(search_for_codec)
|
||||||
|
|
||||||
register_codec(encode, decode, "alphanumeric")
|
register_codec(encode, decode, "alphanumeric")
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from custom_codecs.register import register_codec
|
|
||||||
from easybits import Bits
|
from easybits import Bits
|
||||||
|
import codecs
|
||||||
def encode(text):
|
def encode(text):
|
||||||
"""An encoder which only handles ASCII: non-ASCII characters
|
"""An encoder which only handles ASCII: non-ASCII characters
|
||||||
are replaced with '?'. Once all the characters are ASCII, this encoder
|
are replaced with '?'. Once all the characters are ASCII, this encoder
|
||||||
@@ -30,4 +29,21 @@ def decode(data):
|
|||||||
text += Bits(byte).ascii
|
text += Bits(byte).ascii
|
||||||
return text
|
return text
|
||||||
|
|
||||||
register_codec(encode, decode, "ascii7")
|
|
||||||
|
def register_codec(encode, decode, name):
    """Register a custom codec under *name* with the ``codecs`` module.

    After registration, ``str.encode(name)`` and ``bytes.decode(name)``
    (as well as ``codecs.encode`` / ``codecs.decode``) dispatch to the
    supplied functions.

    Args:
        encode: Callable taking the text to encode and returning the
            encoded data.
        decode: Callable taking the encoded data and returning text.
        name: Name under which the codec is looked up.
    """
    # The codec machinery hands search functions a normalized name (lower
    # case; recent Python versions also turn hyphens and spaces into
    # underscores), so normalize both sides before comparing.
    def normalize(codec_name):
        return codec_name.lower().replace("-", "_").replace(" ", "_")

    registered_name = normalize(name)

    # ``errors`` is accepted because the codec machinery forwards it whenever
    # the caller supplies one, e.g. ``text.encode(name, "strict")``. The
    # wrapped functions have no error modes, so the value is ignored.
    def encode_wrapper(text, errors="strict"):
        return encode(text), len(text)

    def decode_wrapper(data, errors="strict"):
        return decode(data), len(data)

    def search_for_codec(query):
        if normalize(query) == registered_name:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        return None

    codecs.register(search_for_codec)
|
||||||
|
|
||||||
|
# Expose this module's encode/decode pair to Python's codec lookup under the
# name "ascii7".
register_codec(encode, decode, "ascii7")
|
||||||
93
text_codecs/commonchars.py
Normal file
93
text_codecs/commonchars.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
from easybits import Bits
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
def encode(text):
    """Encode text using 4-bit codes for the eight most common letters.

    Each of the characters e, t, a, o, i, n, s, h is written as a ``1`` bit
    followed by its 3-bit index in that list. Every other character is
    written as its standard 8-bit ASCII code.

    The encoding is lossy: characters that cannot be encoded as ASCII are
    replaced with ``?`` and cannot be recovered when decoding. The original
    author estimates about a 77 percent compression rate on English text.

    Args:
        text: The string to encode.

    Returns:
        The ``bytes`` value of the concatenated bit sequence.
    """
    commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h']
    # Precompute each common character's code once: the marker bit "1"
    # followed by the character's position written as three binary digits.
    short_codes = {
        char: Bits('1').concat(Bits(format(index, '03b')))
        for index, char in enumerate(commonchars)
    }

    result = Bits()
    for char in text:
        short_code = short_codes.get(char)
        if short_code is not None:
            result = result.concat(short_code)
            continue
        try:
            literal = Bits(char, encoding='ascii')
        except UnicodeEncodeError:
            # Non-ASCII input has no representation here; substitute '?'.
            literal = Bits('?', encoding='ascii')
        result = result.concat(literal)
    return result.bytes
|
||||||
|
|
||||||
|
def decode(data):
    """Decode data produced by the matching ``encode`` function.

    The bit stream is read from the start. A ``1`` bit introduces a 3-bit
    index into the common-character list (4 bits consumed in total); a
    ``0`` bit starts a standard 8-bit ASCII character.

    Args:
        data: The encoded bytes-like data.

    Returns:
        The decoded text.
    """
    bits = Bits(bytes(data))
    commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h']
    # Build the eight 3-bit patterns once instead of on every comparison.
    short_codes = [
        (Bits(format(index, '03b')), char)
        for index, char in enumerate(commonchars)
    ]

    pieces = []
    total = len(bits)
    i = 0
    while i < total:
        if bits[i] == 1:
            index_bits = bits[i+1:i+4]
            for pattern, char in short_codes:
                if index_bits == pattern:
                    pieces.append(char)
                    break
            i += 4
        else:
            if total - i < 8:
                # Fewer than 8 bits cannot hold an ASCII character. An odd
                # number of 4-bit codes leaves the stream 4 bits short of a
                # byte boundary; the leftover bits are presumably padding
                # added when the encoder converted to bytes (TODO confirm
                # against easybits), so stop instead of decoding them.
                break
            pieces.append(bits[i:i+8].ascii)
            i += 8
    return "".join(pieces)
|
||||||
|
|
||||||
|
def register_codec(encode, decode, name):
    """Register a custom codec under *name* with the ``codecs`` module.

    After registration, ``str.encode(name)`` and ``bytes.decode(name)``
    (as well as ``codecs.encode`` / ``codecs.decode``) dispatch to the
    supplied functions.

    Args:
        encode: Callable taking the text to encode and returning the
            encoded data.
        decode: Callable taking the encoded data and returning text.
        name: Name under which the codec is looked up.
    """
    # The codec machinery hands search functions a normalized name (lower
    # case; recent Python versions also turn hyphens and spaces into
    # underscores), so normalize both sides before comparing.
    def normalize(codec_name):
        return codec_name.lower().replace("-", "_").replace(" ", "_")

    registered_name = normalize(name)

    # ``errors`` is accepted because the codec machinery forwards it whenever
    # the caller supplies one, e.g. ``text.encode(name, "strict")``. The
    # wrapped functions have no error modes, so the value is ignored.
    def encode_wrapper(text, errors="strict"):
        return encode(text), len(text)

    def decode_wrapper(data, errors="strict"):
        return decode(data), len(data)

    def search_for_codec(query):
        if normalize(query) == registered_name:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        return None

    codecs.register(search_for_codec)
|
||||||
|
|
||||||
|
# Expose this module's encode/decode pair to Python's codec lookup under the
# name "commonchars".
register_codec(encode, decode, "commonchars")
|
||||||
@@ -72,4 +72,20 @@ if args.encodings:
|
|||||||
print(tabulate(results, headers="keys"))
|
print(tabulate(results, headers="keys"))
|
||||||
if args.inspect:
|
if args.inspect:
|
||||||
print(inspect_encoded_text(args.inspect, args.text))
|
print(inspect_encoded_text(args.inspect, args.text))
|
||||||
|
|
||||||
|
def register_codec(encode, decode, name):
    """Register a custom codec under *name* with the ``codecs`` module.

    After registration, ``str.encode(name)`` and ``bytes.decode(name)``
    (as well as ``codecs.encode`` / ``codecs.decode``) dispatch to the
    supplied functions.

    Args:
        encode: Callable taking the text to encode and returning the
            encoded data.
        decode: Callable taking the encoded data and returning text.
        name: Name under which the codec is looked up.
    """
    # The codec machinery hands search functions a normalized name (lower
    # case; recent Python versions also turn hyphens and spaces into
    # underscores), so normalize both sides before comparing.
    def normalize(codec_name):
        return codec_name.lower().replace("-", "_").replace(" ", "_")

    registered_name = normalize(name)

    # ``errors`` is accepted because the codec machinery forwards it whenever
    # the caller supplies one, e.g. ``text.encode(name, "strict")``. The
    # wrapped functions have no error modes, so the value is ignored.
    def encode_wrapper(text, errors="strict"):
        return encode(text), len(text)

    def decode_wrapper(data, errors="strict"):
        return decode(data), len(data)

    def search_for_codec(query):
        if normalize(query) == registered_name:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        return None

    # One registration is enough for the lookup machinery to find the codec.
    codecs.register(search_for_codec)
|
||||||
Reference in New Issue
Block a user