#!/usr/bin/env python
# coding: utf-8
"""Compress a text file with a two-character context model and a decimal
interval coder.

For every pair of consecutive characters the script counts which character
follows, turns the counts into probabilities, and then narrows a [0, 1)
interval once per character of the text.  The decimal digits of the final
interval are stored as one integer.  All outputs are pickled into the
``compression_files`` directory under the current working directory.
"""

# In[3]:

import pickle
import os
import sys
from datetime import datetime

# Name of the directory (under the current working directory) that receives
# the pickled outputs.
_OUTPUT_DIR = 'compression_files'

# Digits converted per int() call.  Kept below 640, the smallest threshold
# Python allows for its int<->str conversion length limit, so the conversion
# works whatever that limit is set to.
_INT_CHUNK_DIGITS = 600


# In[ ]:

def update_ranges(range_low, range_high):
    """Shift out the leading decimal digits shared by both interval bounds.

    While the first digit after the decimal point is the same for both
    bounds, that digit is recorded and both bounds are multiplied by ten and
    reduced modulo 1.

    Args:
        range_low: lower bound of the current interval.
        range_high: upper bound of the current interval.

    Returns:
        A tuple ``(range_low, range_high, digits)`` where ``digits`` is the
        list of shifted-out digits as one-character strings (possibly empty).

    Raises:
        ValueError: if the bounds are (or become) equal.  Equal bounds always
            share their leading digit, so the loop could never terminate.
    """
    digits = []
    while True:
        if range_low == range_high:
            raise ValueError(
                'coding interval collapsed to a single value '
                '(floating point precision exhausted)'
            )
        scaled_low = range_low * 10
        scaled_high = range_high * 10
        leading = int(scaled_low)
        if leading != int(scaled_high):
            break
        digits.append(str(leading))
        range_low = scaled_low % 1
        range_high = scaled_high % 1
    return range_low, range_high, digits


def reset_ranges(counter_dict, range_low, range_high):
    """Assign each prediction character its sub-interval, in place.

    ``counter_dict`` maps first char -> second char -> prediction char ->
    ``[count, probability, (low, high)]``.  For every (first, second) context
    the prediction characters are visited in sorted order and given adjacent
    sub-intervals of ``[range_low, range_high]`` whose widths are proportional
    to their probabilities.  Only the third element of each entry is changed.

    Args:
        counter_dict: the nested model dictionary described above.
        range_low: lower bound of the interval to divide.
        range_high: upper bound of the interval to divide.
    """
    diff = range_high - range_low
    for by_second in counter_dict.values():
        for by_prediction in by_second.values():
            low = range_low
            for prediction_char in sorted(by_prediction):
                entry = by_prediction[prediction_char]
                high = low + entry[1] * diff
                entry[2] = (low, high)
                low = high


# In[4]:

def get_range(counter_dict, first_char, second_char, prediction_char, low, high):
    """Map a prediction character's stored sub-interval into ``[low, high]``.

    Args:
        counter_dict: model whose entries hold ``(low, high)`` at index 2.
        first_char: first character of the context.
        second_char: second character of the context.
        prediction_char: the character that follows the context.
        low: lower bound of the current interval.
        high: upper bound of the current interval.

    Returns:
        The ``(new_low, new_high)`` bounds of the narrowed interval.
    """
    static_low, static_high = counter_dict[first_char][second_char][prediction_char][2]
    diff = high - low
    new_low = (static_low * diff) + low
    new_high = (static_high * diff) + low
    return new_low, new_high


def _count_trigrams(text):
    """Count, for each pair of consecutive characters, the character after it."""
    counts = {}
    for first_char, second_char, prediction_char in zip(text, text[1:], text[2:]):
        by_prediction = counts.setdefault(first_char, {}).setdefault(second_char, {})
        by_prediction[prediction_char] = by_prediction.get(prediction_char, 0) + 1
    return counts


def _build_model(counts):
    """Turn raw counts into ``[count, probability, (low, high)]`` entries."""
    model = {}
    for first_char, by_second in counts.items():
        model[first_char] = {}
        for second_char, by_prediction in by_second.items():
            total = sum(by_prediction.values())
            model[first_char][second_char] = {
                prediction_char: [count, count / total, (0, 0)]
                for prediction_char, count in by_prediction.items()
            }
    reset_ranges(model, range_low=0.0, range_high=1.0)
    return model


def _digits_to_int(digit_string):
    """Convert a decimal digit string of any length to an int.

    A single ``int(digit_string)`` raises ValueError on Python versions that
    enforce the integer string conversion length limit (4300 digits by
    default), so the string is converted in short chunks instead.
    """
    number = 0
    for start in range(0, len(digit_string), _INT_CHUNK_DIGITS):
        chunk = digit_string[start:start + _INT_CHUNK_DIGITS]
        number = number * 10 ** len(chunk) + int(chunk)
    return number


def _encode(text, model):
    """Narrow the interval once per character and return the code as an int."""
    digits = []
    range_low, range_high = 0.0, 1.0
    for first_char, second_char, prediction_char in zip(text, text[1:], text[2:]):
        range_low, range_high = get_range(
            model, first_char, second_char, prediction_char, range_low, range_high
        )
        range_low, range_high, shifted = update_ranges(range_low, range_high)
        digits += shifted
    # Finish with the first two decimals of the midpoint of what is left.
    midpoint = (range_low + range_high) / 2
    digits.append(str(midpoint)[2:4])
    # NOTE(review): converting to int drops leading zero digits of the code.
    # The stored format is kept as-is because the decoder is not visible
    # here -- confirm that it restores them.
    return _digits_to_int(''.join(digits))


def compress(filepath):
    """Compress the UTF-8 text file at ``filepath``.

    On success four pickles are written to ``compression_files`` under the
    current working directory:

    * ``dictionary.pickle``    -- raw follower counts per two-char context
    * ``integer_value.pickle`` -- the digits of the code as one integer
    * ``genesis_chars.pickle`` -- the first two characters of the text
    * ``len_of_text``          -- number of encoded characters (length - 2)

    Nothing is written when the file cannot be read, holds fewer than two
    characters, or the coding interval collapses; a message is printed
    instead.

    Args:
        filepath: path of the text file to compress.

    Returns:
        None.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
    except (OSError, ValueError):
        # ValueError covers undecodable bytes (UnicodeDecodeError) and
        # malformed paths such as ones containing a NUL byte.
        print('Could not read the file!')
        return

    if len(text) < 2:
        # The two leading "genesis" characters cannot be formed.
        print('File is too short to compress (at least 2 characters are needed)!')
        return

    counts = _count_trigrams(text)
    model = _build_model(counts)
    try:
        tostore = _encode(text, model)
    except ValueError as err:
        print('Compression failed: ' + str(err))
        return

    # Write everything only after encoding succeeded, so a failure never
    # leaves a partial set of output files behind.
    out_dir = os.path.join(os.path.abspath(os.getcwd()), _OUTPUT_DIR)
    os.makedirs(out_dir, exist_ok=True)

    # Save the dictionary
    with open(os.path.join(out_dir, 'dictionary.pickle'), 'wb') as f:
        pickle.dump(counts, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(out_dir, 'integer_value.pickle'), 'wb') as f:
        pickle.dump(tostore, f)
    with open(os.path.join(out_dir, 'genesis_chars.pickle'), 'wb') as f:
        pickle.dump(text[:2], f)
    with open(os.path.join(out_dir, 'len_of_text'), 'wb') as f:
        pickle.dump(len(text) - 2, f)

    print('Compression successful!')


# In[5]:

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Please give input file path! (or just name, if file in current directory)')
        # A usage error exits with a non-zero status.
        sys.exit(1)
    filepath = os.path.abspath(sys.argv[1])
    compress(filepath)


# In[ ]: