Skip to content

Instantly share code, notes, and snippets.

@brunovianarezende
Forked from gsakkis/benchmark.py
Created October 4, 2011 19:34
Show Gist options
  • Select an option

  • Save brunovianarezende/1262559 to your computer and use it in GitHub Desktop.

Select an option

Save brunovianarezende/1262559 to your computer and use it in GitHub Desktop.
Slope One
import sys, csv, time
from collections import defaultdict
from itertools import islice
import orig_slopeone, slopeone #,slsparse
classes = [
orig_slopeone.SlopeOne,
#orig_slopeone.SlopeOneNumpy, # XXX: dog slow
slopeone.SlopeOne,
slopeone.SlopeOne2,
slopeone.SlopeOneSym,
slopeone.SlopeOneSym2,
]
class ItemsStorage(object):
def __init__(self, maxsize=None):
self.items = []
self.item2id = {}
self.maxsize = maxsize
def add_item(self, item):
id = self.item2id.get(item)
if id is None:
n = len(self.items)
if self.maxsize is None or n < self.maxsize:
self.item2id[item] = n
self.items.append(item)
id = n
return id
def id(self, item):
return self.item2id[item]
def item(self, id):
return self.items[id]
def __len__(self):
return len(self.items)
def load_data(max_num_items=None):
users_ratings = defaultdict(dict)
items_storage = ItemsStorage(max_num_items)
reader = csv.reader(open('BX-Book-Ratings.csv'), delimiter=';')
reader.next() # skip header
for user_id, isbn, rating in reader:
item_id = items_storage.add_item(isbn)
rating = int(rating)
if item_id is None or rating == 0:
continue
users_ratings[user_id][item_id] = rating
return len(items_storage), users_ratings
def main(max_num_items, num_users):
top = 5
num_items, user_ratings = load_data(max_num_items)
print >> sys.stderr, '%d items / %d users / %d ratings' % (
num_items, len(user_ratings),
sum(len(r) for r in user_ratings.itervalues()))
for cls in classes:
print >> sys.stderr, '%s.%s' % (cls.__module__, cls.__name__)
slopone = cls(num_items)
start = time.time()
slopone.update(user_ratings.itervalues())
print >> sys.stderr, ' training time: %s' % (time.time() - start)
start = time.time()
for itemid2rating in islice(user_ratings.itervalues(), num_users):
recommendations = slopone.recomendations(itemid2rating, top)
print >> sys.stderr, ' prediction time (top %d for %d users): %s' % (
top, num_users, time.time() - start)
if __name__ == '__main__':
main(*map(int, sys.argv[1:]))
from heapq import nlargest
from numpy import int32, float32
from scipy.sparse import dok_matrix
class SlopeOne(object):
def __init__(self, num_items):
self.diffs = {}
self.freqs = {}
def recomendations(self, useritems, top=None):
preds, freqs = {}, {}
useritems = dict(useritems)
for item, rating in useritems.iteritems():
for diffitem, diffratings in self.diffs.iteritems():
try:
freq = self.freqs[diffitem][item]
except KeyError:
continue
preds.setdefault(diffitem, 0.0)
freqs.setdefault(diffitem, 0)
preds[diffitem] += freq * (diffratings[item] + rating)
freqs[diffitem] += freq
predlist = [(value / freqs[item], item)
for item, value in preds.iteritems()
if item not in useritems and freqs[item] > 0]
if top is not None:
predlist = nlargest(top, predlist)
else:
predlist.sort(key=lambda x: (-x[0], -x[1]))
return predlist
def update(self, userdata):
for userratings in userdata:
for item1, rating1 in userratings.iteritems():
self.freqs.setdefault(item1, {})
self.diffs.setdefault(item1, {})
for item2, rating2 in userratings.iteritems():
self.freqs[item1].setdefault(item2, 0)
self.diffs[item1].setdefault(item2, 0.0)
self.freqs[item1][item2] += 1
self.diffs[item1][item2] += rating1 - rating2
for item1, ratings in self.diffs.iteritems():
for item2 in ratings:
ratings[item2] /= self.freqs[item1][item2]
class SlopeOneNumpy(object):
def __init__(self, num_items):
self.freqs = dok_matrix((num_items, num_items), dtype=int32)
self.diffs = dok_matrix((num_items, num_items), dtype=float32)
def recomendations(self, useritems, top=None):
resultpreds, resultfreqs = {}, {}
useritems = dict(useritems)
for id, rating in useritems.iteritems():
freqrow = self.freqs.getrow(id)
for freq, indice in zip(freqrow.data, freqrow.indices):
resultpreds.setdefault(indice, 0.0)
resultfreqs.setdefault(indice, 0)
resultpreds[indice] += freq * (self.diffs[indice, id] + rating)
resultfreqs[indice] += freq
predlist = [(value / resultfreqs[item], item)
for item, value in resultpreds.iteritems()
if item not in useritems and resultfreqs[item] > 0]
if top is not None:
predlist = nlargest(top, predlist)
else:
predlist.sort(key=lambda x: (-x[0], -x[1]))
return predlist
def update(self, userdata):
for userratings in userdata:
for item1, rating1 in userratings.iteritems():
for item2, rating2 in userratings.iteritems():
self.freqs[item1, item2] += 1
delta = float32(rating1 - rating2)
if delta != 0:
# there is a bug in scipy when assign a 0 value for a already zero entry
self.diffs[item1, item2] += delta
for key, rating in self.diffs.iteritems():
self.diffs[key] = rating/self.freqs[key]
from __future__ import division
from collections import defaultdict
from heapq import nlargest
from operator import itemgetter
class SlopeOne(object):
def __init__(self, num_items):
self.freqs = defaultdict(lambda: defaultdict(int))
self.diffs = defaultdict(lambda: defaultdict(int))
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
item_ratings = sorted(user.iteritems(), key=itemgetter(0))
while item_ratings:
item1, rating1 = item_ratings.pop()
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in item_ratings:
item1_freqs[item2] += 1
freqs[item2][item1] += 1
diff = rating1 - rating2
if diff:
item1_diffs[item2] += diff
diffs[item2][item1] -= diff
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
def recomendations(self, item2rating, top=None):
freqs, diffs = self.freqs, self.diffs
predfreqs = defaultdict(int)
preds = defaultdict(int)
for olditem in freqs:
if olditem in item2rating:
continue
freqs_olditem = freqs[olditem]
for item in item2rating:
freq = freqs_olditem.get(item)
if freq is not None:
predfreqs[olditem] += freq
diffrating = diffs[olditem][item]
preds[olditem] += freq * (item2rating[item] + diffrating)
results = ((preds[item]/predfreqs[item], item) for item in preds)
if top is not None:
results = nlargest(top, results)
else:
results = sorted(results, reverse=True)
return results
def dump(self):
diffs = self.diffs
key = itemgetter(0)
for i, rows in sorted(self.freqs.iteritems(), key=key):
diff_i = diffs[i]
for j, freq in sorted(rows.iteritems(), key=key):
print i, j, freq, diff_i[j]
class SlopeOne2(SlopeOne):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
for item1, rating1 in user.iteritems():
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in user.iteritems():
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - rating2
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
class SlopeOneSym(SlopeOne):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
item_ratings = sorted(user.iteritems(), key=itemgetter(0))
while item_ratings:
item1, rating1 = item_ratings.pop()
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in item_ratings:
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - rating2
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
def recomendations(self, item2rating, top=None):
freqs, diffs = self.freqs, self.diffs
predfreqs = defaultdict(int)
preds = defaultdict(int)
for olditem in diffs:
if olditem in item2rating:
continue
freqs_olditem = freqs[olditem]
for item in item2rating:
if olditem > item:
freq = freqs_olditem.get(item)
else:
freq = freqs[item].get(olditem)
if freq is not None:
predfreqs[olditem] += freq
if olditem > item:
diffrating = diffs[olditem][item]
else:
diffrating = -diffs[item][olditem]
preds[olditem] += freq * (item2rating[item] + diffrating)
results = ((preds[item]/predfreqs[item], item) for item in preds)
if top is not None:
results = nlargest(top, results)
else:
results = sorted(results, reverse=True)
return results
class SlopeOneSym2(SlopeOneSym):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
for item1, rating1 in user.iteritems():
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2 in user:
if item2 >= item1:
continue
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - user[item2]
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
import unittest
import slopeone
import orig_slopeone
from benchmark import ItemsStorage
class SlopeOneTest(unittest.TestCase):
users = [
dict(b1=10, b2=5, b3=7, b4=0, b5=8),
dict(b1=3, b3=6, b5=9, b7=1),
dict(b2=4, b4=6, b6=8),
dict(b5=2, b6=3, b7=4, b8=5, b9=6),
dict(b8=10, b9=2, b10=8),
dict(b4=3, b5=5, b6=7),
]
recommendations = [
[('b9', 12), ('b8', 11), ('b6', 6.8), ('b7', 5)],
[('b9', 8), ('b8', 7), ('b6', 7), ('b2', 8/3.), ('b4', 0)],
[('b1', 12.5), ('b9', 11), ('b8', 10), ('b3', 9.5), ('b7', 9.0), ('b5', 8.4)],
[('b10', 7.5), ('b3', 3), ('b1', 2), ('b2', -1), ('b4', -1.5)],
[('b7', 4.5), ('b6', 3.5), ('b5', 2.5)],
[('b9', 9.5), ('b8', 8.5), ('b1', 19/3.), ('b3', 16/3.), ('b7', 4), ('b2', 3.5)],
]
def setUp(self):
self.items_storage = items_storage = ItemsStorage()
for user in self.users:
for book_id in user.iterkeys():
items_storage.add_item(book_id)
self.ratings = map(self._itemid2rating, self.users)
def test_slope_one(self):
self._test(slopeone.SlopeOne)
def test_slope_one2(self):
self._test(slopeone.SlopeOne2)
def test_slope_one_sym(self):
self._test(slopeone.SlopeOneSym)
def test_slope_one_sym2(self):
self._test(slopeone.SlopeOneSym2)
def test_orig_slope_one(self):
self._test(orig_slopeone.SlopeOne)
def test_orig_slope_one_numpy(self):
self._test(orig_slopeone.SlopeOneNumpy)
def _itemid2rating(self, user):
return dict((self.items_storage.id(book_id), rating)
for book_id, rating in user.iteritems())
def _test(self, cls):
slopeone = cls(len(self.items_storage))
slopeone.update(self.ratings)
for user, recommendations in zip(self.users, self.recommendations):
for top in None, 1, 2, 3:
predictions = [
(self.items_storage.item(item_id), rating)
for rating, item_id in
slopeone.recomendations(self._itemid2rating(user), top=top)
]
self.assertEqual(predictions, recommendations[:top])
class SmallSlopeOneTest(SlopeOneTest):
users = [
dict(b1=5, b2=3, b3=2),
dict(b1=3, b2=4),
dict(b2=2, b3=5),
]
recommendations = [
[],
[('b3', 10/3.)],
[('b1', 13/3.)],
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment