Encodes as a list of (char, count) tuples
- Three basic types of data compression.
`encodeFile` needs to call `encodeString`; `decodeFile` needs to call `decodeString`.
import os
import json
# Encodes as a list of (char, count) tuples
def encodeString(stringVal):
    """Run-length encode a string.

    Args:
        stringVal: The text to encode.

    Returns:
        A list of ``(char, count)`` tuples, one per run of identical
        consecutive characters, in order of appearance. An empty input
        yields an empty list.
    """
    encodedList = []
    prevChar = None
    count = 0
    for char in stringVal:
        # A run ends when the character changes; ``count`` is 0 only before
        # the first character, so no run is flushed at that point.
        if count and char != prevChar:
            encodedList.append((prevChar, count))
            count = 0
        prevChar = char
        count += 1
    # Flush the final run. Skipped for empty input, which would otherwise
    # produce a bogus (None, 0) entry that cannot be decoded.
    if count:
        encodedList.append((prevChar, count))
    return encodedList
def decodeString(encodedList):
    """Expand a run-length encoded list back into a string.

    Args:
        encodedList: An iterable of ``(char, count)`` pairs. Two-element
            lists (as produced by a JSON round trip) work as well.

    Returns:
        The decoded string. Items that cannot be expanded into text are
        printed and skipped rather than aborting the whole decode.
    """
    pieces = []
    for item in encodedList:
        try:
            piece = item[0] * item[1]
            # Only text may be appended to the result; anything else (for
            # example a pair of two numbers) counts as a malformed item.
            if not isinstance(piece, str):
                raise TypeError('run did not expand to a string')
        except (TypeError, IndexError, KeyError):
            # Best effort: report the malformed item and carry on. Only the
            # errors a bad item can cause are caught, so interrupts and
            # unrelated failures still propagate.
            print(item)
            continue
        pieces.append(piece)
    # Join once at the end instead of repeated string concatenation.
    return ''.join(pieces)
def encodeFile(filename, newFilename):
    """Run-length encode a text file and save the result as JSON.

    Args:
        filename: Path of the text file to read.
        newFilename: Path of the file that receives the encoded data;
            it is opened in write mode.
    """
    with open(filename) as source:
        runs = encodeString(source.read())
    # Serialise the list of runs to a JSON blob, then write it out in one go
    payload = json.dumps(runs)
    with open(newFilename, 'w') as target:
        target.write(payload)
def decodeFile(filename):
    """Read a JSON-encoded run list from a file and decode it.

    Args:
        filename: Path of the file holding the JSON run list.

    Returns:
        The string produced by passing the parsed list to ``decodeString``.
    """
    with open(filename) as source:
        runs = json.loads(source.read())
    return decodeString(runs)
# Demo: report the file size before and after encoding, then print the
# round-tripped contents.
# The file names inside the f-strings use double quotes so they can be nested
# within the single-quoted literals.
print(f'Original file size: {os.path.getsize("10_04_challenge_art.txt")}')
# This takes in a new file name
encodeFile('10_04_challenge_art.txt', '10_04_challenge_art_encoded.txt')
print(f'New file size: {os.path.getsize("10_04_challenge_art_encoded.txt")}')
print(decodeFile('10_04_challenge_art_encoded.txt'))
- Running the above will then output:
Original file size: 2757
New file size: 2441
Outputs the ASCII art below.
- Keep in mind that the extra characters in a JSON file (commas, quotation marks, brackets, etc.) all take up space.
- To further improve the compression, we can do:
```python
# [('A', 1), ('B', 80), ('C', 10)]
# This outputs | ~ | ~ in that pattern instead
# becomes A|1~B|80~C|10
def encodeFile(filename, newFilename):
    with open(filename) as f:
        data = encodeString(f.read())
    data = [f'{char}|{count}' for char, count in data]
    with open(newFilename, 'w') as f:
        f.write('~'.join(data))

# The decoder then splits the data and puts it into pairs
def decodeFile(filename):
    with open(filename) as f:
        data = f.read()
    pairs = data.split('~')
    pairs = [p.split('|') for p in pairs]
    # This is the stage where it is placed into pairs
    pairs = [(p[0], int(p[1])) for p in pairs]
    return decodeString(pairs)
```
- Running that, we then get:
Original file size: 2757
New file size: 1007
Outputs the ASCII art below.
- An even better solution than the last:
- You can store any integer up to 255 in a single byte (or a single character's worth) of data. The format below therefore assumes every run is at most 255 characters long and every character encodes to a single UTF-8 byte.
```
def encodeFile(filename, newFilename):
    with open(filename) as f:
        data = encodeString(f.read())
    output = bytearray()
    for item in data:
        # Character byte
        output.extend(bytes(item[0], 'utf-8'))
        # Integer count byte
        output.extend(item[1].to_bytes(1, 'big'))
    with open(newFilename, 'wb') as binary_file:
        # Write bytes to file
        binary_file.write(output)

def decodeFile(filename):
    with open(filename, 'rb') as f:
        data = f.read()
    # Split data into pairs
    bytePairs = [data[i:i+2] for i in range(0, len(data), 2)]
    encodedList = []
    for bytePair in bytePairs:
        encodedList.append((bytePair[:1].decode('utf-8'), int.from_bytes(bytePair[1:], 'big')))
    return decodeString(encodedList)

print(f'Original file size: {os.path.getsize("10_04_challenge_art.txt")}'),
encodeFile('10_04_challenge_art.txt', '10_04_challenge_art_encoded.aa')
print(f'New file size: {os.path.getsize("10_04_challenge_art_encoded.aa")}')
print(decodeFile('10_04_challenge_art_encoded.aa'))
```
- The above outputs a file of 466 bytes.