Skip to content

Instantly share code, notes, and snippets.

@RafalKucharskiPK
Created November 16, 2022 09:31
Show Gist options
  • Select an option

  • Save RafalKucharskiPK/841f3a52afa3bac1c72bf4bc7f874436 to your computer and use it in GitHub Desktop.

Select an option

Save RafalKucharskiPK/841f3a52afa3bac1c72bf4bc7f874436 to your computer and use it in GitHub Desktop.
bow
import io
from collections import defaultdict
import json
import pickle
class BagOfWords():
    """Word-frequency counter built from a string, a text stream or a JSON dump.

    Words are whitespace-separated tokens. Counts are kept in ``self._d``,
    a ``defaultdict(int)`` mapping each word to its number of occurrences.
    """

    def __init__(self, txt=''):
        """Build the bag from *txt*.

        Args:
            txt: One of
                * an open text stream (any ``io.TextIOBase``, e.g. the result
                  of ``open(...)`` in text mode or an ``io.StringIO``), read
                  line by line; the caller remains responsible for closing it;
                * a string ending in ``'.json'``, treated as the path of a
                  JSON file holding a word -> count mapping;
                * any other string, counted directly.
                Any other type yields an empty bag.
        """
        self._d = defaultdict(int)
        if isinstance(txt, io.TextIOBase):
            for line in txt:
                for word in line.split():
                    self._d[word] += 1
        elif isinstance(txt, str):
            if txt.endswith('.json'):
                with open(txt, 'r') as f:
                    self._d = defaultdict(int, dict(json.load(f)))
            else:
                for word in txt.split():
                    self._d[word] += 1
        # Cursor used only by __next__ (direct next(bag) calls).
        self.index = 0

    def __getitem__(self, key):
        """Return the count for *key*, or 0 if the word was never seen."""
        # Use .get() rather than indexing: indexing a defaultdict would insert
        # the missing key with count 0 as a side effect of a mere lookup.
        return self._d.get(key, 0)

    def __setitem__(self, key, value):
        """Set the count of *key* to *value*."""
        self._d[key] = value

    def __str__(self):
        """Return the counts as ``'word:count, word:count, ...'``."""
        return ', '.join(f"{k}:{v}" for k, v in self._d.items())

    def __contains__(self, key):
        """Return True if *key* is stored in the bag."""
        return key in self._d

    def __iter__(self):
        """Iterate over the words, most frequent first."""
        return iter(sorted(self._d, key=self._d.get, reverse=True))

    def __next__(self):
        """Return the next word in insertion order, advancing ``self.index``.

        This cursor is independent of ``__iter__`` (which returns its own
        iterator) and is never reset.

        Raises:
            StopIteration: when the cursor has reached the end of the bag.
        """
        # >= instead of == so a cursor that is already past the end still
        # signals exhaustion instead of raising IndexError.
        if self.index >= len(self._d):
            raise StopIteration
        val = list(self._d)[self.index]
        self.index = self.index + 1
        return val

    def __add__(self, second_bag):
        """Return a new bag whose counts are the sums of both operands."""
        if not isinstance(second_bag, BagOfWords):
            # Let Python try the reflected operation / raise TypeError.
            return NotImplemented
        new_bag = BagOfWords()
        new_bag._d = self.add(self._d, second_bag._d)
        return new_bag

    def save(self, filename='dump.json'):
        """Write the word counts to *filename* as JSON."""
        with open(filename, 'w') as fp:
            json.dump(self._d, fp)

    def pickle(self, filename='pickle.pickle'):
        """Serialize this whole object to *filename* with ``pickle``."""
        with open(filename, "wb") as pfile:
            pickle.dump(self, pfile)

    @staticmethod
    def add(d1, d2):
        """Merge two word -> count mappings into a new ``defaultdict(int)``.

        Keys present in both mappings get the sum of their values; all other
        keys keep their value. Keys of *d1* come first, followed by the keys
        that appear only in *d2*. Neither input is modified.
        """
        merged = defaultdict(int, d1)
        for word, count in d2.items():
            if word in merged:
                merged[word] += count
            else:
                merged[word] = count
        return merged
# Demo: count the words of 'hamlet.txt' (expected in the current working
# directory), then round-trip the result through JSON and pickle.
# The file is opened in a `with` block so the handle is closed once counted.
with open('hamlet.txt') as hamlet_file:
    obj1 = BagOfWords(hamlet_file)
# obj2 = BagOfWords('tomek janek ala itp i')
# print("Test 1[create from file]:")
# print(obj1, end='\n\n')
# print("Test 2[create from str]:")
# print(obj2, end='\n\n')
# print("Test 3[add]:")
# obj3 = obj1 + obj2
# print(obj3, end='\n\n')
# print("Test 4[setitem and getitem]:")
# obj3["tomek"] = 15
# print(obj3["tomek"], end='\n\n')
# print("Test 5[for in]:")
# for word in obj3:
#     print(word)

# save() with no argument writes 'dump.json'; reload the bag from that file.
obj1.save()
obj2 = BagOfWords('dump.json')

# pickle() with no argument writes 'pickle.pickle'.
obj2.pickle()
# NOTE: pickle.load can execute arbitrary code from the file it reads. It is
# acceptable here only because the file was written by the line above; never
# point this at a pickle from an untrusted source.
with open('pickle.pickle', "rb") as pfile:
    lst = pickle.load(pfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment