generated from mwc/lab_compression
implemented encoding/decoding in commonchars.py
This commit is contained in:
93
text_codecs/commonchars.py
Normal file
93
text_codecs/commonchars.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from easybits import Bits
|
||||
import codecs
|
||||
|
||||
def encode(text):
    """Encode ``text`` with a variable-width "common characters" scheme.

    The eight characters 'e', 't', 'a', 'o', 'i', 'n', 's', 'h' are each
    written as 4 bits: a ``1`` flag bit followed by the character's 3-bit
    position in that list ('e' -> 000, 't' -> 001, ... 'h' -> 111).
    Every other character is written using ``Bits(char, encoding='ascii')``,
    which the original author describes as the standard 8-bit ASCII code.

    The encoding is lossy: a character that cannot be encoded as ASCII is
    replaced with '?' and cannot be recovered when decoding.

    The original author reports a compression rate of about 77 percent on
    English text (figure not verified here).

    Args:
        text: The string to encode.

    Returns:
        The encoded bit stream as returned by ``Bits.bytes``.
    """
    commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h']

    # Precompute the 4-bit code of every common character once instead of
    # walking an if/elif chain per character.  A fresh Bits('1') is built
    # for each entry so the table does not depend on whether ``concat``
    # returns a new object or extends its receiver.
    short_codes = {
        char: Bits('1').concat(Bits(format(index, '03b')))
        for index, char in enumerate(commonchars)
    }
    # Stand-in written for any character that has no ASCII encoding.
    fallback = Bits('?', encoding='ascii')

    result = Bits()
    for char in text:
        code = short_codes.get(char)
        if code is None:
            try:
                code = Bits(char, encoding='ascii')
            except UnicodeEncodeError:
                code = fallback
        result = result.concat(code)
    # NOTE(review): the stream length is a multiple of 4 bits, not always of
    # 8; presumably ``Bits.bytes`` pads the final byte -- confirm in easybits.
    return result.bytes
|
||||
|
||||
def decode(data):
    """Decode bytes produced by the matching ``encode`` function.

    The bit stream is read left to right.  When the next bit is 1, the
    following 3 bits select one of the eight common characters
    'e', 't', 'a', 'o', 'i', 'n', 's', 'h' and 4 bits are consumed.
    When the next bit is 0, the next 8 bits are converted to a character
    through the ``ascii`` property of ``Bits`` and 8 bits are consumed.

    Args:
        data: The encoded bytes (anything accepted by ``bytes(data)``).

    Returns:
        The decoded text as a ``str``.
    """
    commonchars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h']
    # (3-bit pattern, character) pairs; compared with ``==`` because that is
    # the only Bits comparison the original code relied on.
    short_codes = [
        (Bits(format(index, '03b')), char)
        for index, char in enumerate(commonchars)
    ]

    bits = Bits(bytes(data))
    total = len(bits)
    pieces = []  # joined once at the end instead of repeated str +=
    i = 0
    while i < total:
        if bits[i] == 1:
            chunk = bits[i + 1:i + 4]
            # A truncated chunk matches no pattern and contributes nothing.
            for code, char in short_codes:
                if chunk == code:
                    pieces.append(char)
                    break
            i += 4
        else:
            # Every code is 4 or 8 bits long, so a stream holding an odd
            # number of short codes ends in 4 leftover bits once it is
            # stored as whole bytes (assumed to be zero padding on the
            # right -- TODO confirm against easybits).  Fewer than 8
            # remaining bits can never be a real ASCII code, so stop here
            # instead of decoding the padding as an extra character.
            if total - i < 8:
                break
            pieces.append(bits[i:i + 8].ascii)
            i += 8
    return "".join(pieces)
|
||||
|
||||
def register_codec(encode, decode, name):
    """Register a codec so it can be used by ``str.encode``/``bytes.decode``.

    Args:
        encode: Callable taking the text and returning the encoded bytes.
        decode: Callable taking the encoded data and returning the text.
        name: Name under which the codec is looked up.

    Returns:
        None.  A search function is added with ``codecs.register``.
    """
    def normalize(label):
        # The codec registry lower-cases the requested name (and converts
        # spaces, and on newer Pythons hyphens, to underscores) before
        # calling search functions, so both sides are compared in that form.
        return label.lower().replace('-', '_').replace(' ', '_')

    wanted = normalize(name)

    # ``errors`` is accepted (and ignored) because the codec machinery passes
    # it as a second positional argument whenever the caller supplies one,
    # e.g. ``"text".encode(name, "strict")``.
    def encode_wrapper(text, errors='strict'):
        return encode(text), len(text)

    def decode_wrapper(data, errors='strict'):
        return decode(data), len(data)

    def search_for_codec(query):
        if normalize(query) == wanted:
            return codecs.CodecInfo(encode_wrapper, decode_wrapper, name=name)
        # Any other name is not ours; None lets the registry keep searching.
        return None

    codecs.register(search_for_codec)
|
||||
|
||||
# Registered at import time so that importing this module is enough to use
# "commonchars" as an encoding name.
register_codec(encode, decode, name="commonchars")
|
||||
Reference in New Issue
Block a user