-
-
Save brunovianarezende/1262559 to your computer and use it in GitHub Desktop.
Slope One
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys, csv, time | |
| from collections import defaultdict | |
| from itertools import islice | |
| import orig_slopeone, slopeone #,slsparse | |
| classes = [ | |
| orig_slopeone.SlopeOne, | |
| #orig_slopeone.SlopeOneNumpy, # XXX: dog slow | |
| slopeone.SlopeOne, | |
| slopeone.SlopeOne2, | |
| slopeone.SlopeOneSym, | |
| slopeone.SlopeOneSym2, | |
| ] | |
| class ItemsStorage(object): | |
| def __init__(self, maxsize=None): | |
| self.items = [] | |
| self.item2id = {} | |
| self.maxsize = maxsize | |
| def add_item(self, item): | |
| id = self.item2id.get(item) | |
| if id is None: | |
| n = len(self.items) | |
| if self.maxsize is None or n < self.maxsize: | |
| self.item2id[item] = n | |
| self.items.append(item) | |
| id = n | |
| return id | |
| def id(self, item): | |
| return self.item2id[item] | |
| def item(self, id): | |
| return self.items[id] | |
| def __len__(self): | |
| return len(self.items) | |
| def load_data(max_num_items=None): | |
| users_ratings = defaultdict(dict) | |
| items_storage = ItemsStorage(max_num_items) | |
| reader = csv.reader(open('BX-Book-Ratings.csv'), delimiter=';') | |
| reader.next() # skip header | |
| for user_id, isbn, rating in reader: | |
| item_id = items_storage.add_item(isbn) | |
| rating = int(rating) | |
| if item_id is None or rating == 0: | |
| continue | |
| users_ratings[user_id][item_id] = rating | |
| return len(items_storage), users_ratings | |
| def main(max_num_items, num_users): | |
| top = 5 | |
| num_items, user_ratings = load_data(max_num_items) | |
| print >> sys.stderr, '%d items / %d users / %d ratings' % ( | |
| num_items, len(user_ratings), | |
| sum(len(r) for r in user_ratings.itervalues())) | |
| for cls in classes: | |
| print >> sys.stderr, '%s.%s' % (cls.__module__, cls.__name__) | |
| slopone = cls(num_items) | |
| start = time.time() | |
| slopone.update(user_ratings.itervalues()) | |
| print >> sys.stderr, ' training time: %s' % (time.time() - start) | |
| start = time.time() | |
| for itemid2rating in islice(user_ratings.itervalues(), num_users): | |
| recommendations = slopone.recomendations(itemid2rating, top) | |
| print >> sys.stderr, ' prediction time (top %d for %d users): %s' % ( | |
| top, num_users, time.time() - start) | |
| if __name__ == '__main__': | |
| main(*map(int, sys.argv[1:])) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from heapq import nlargest | |
| from numpy import int32, float32 | |
| from scipy.sparse import dok_matrix | |
| class SlopeOne(object): | |
| def __init__(self, num_items): | |
| self.diffs = {} | |
| self.freqs = {} | |
| def recomendations(self, useritems, top=None): | |
| preds, freqs = {}, {} | |
| useritems = dict(useritems) | |
| for item, rating in useritems.iteritems(): | |
| for diffitem, diffratings in self.diffs.iteritems(): | |
| try: | |
| freq = self.freqs[diffitem][item] | |
| except KeyError: | |
| continue | |
| preds.setdefault(diffitem, 0.0) | |
| freqs.setdefault(diffitem, 0) | |
| preds[diffitem] += freq * (diffratings[item] + rating) | |
| freqs[diffitem] += freq | |
| predlist = [(value / freqs[item], item) | |
| for item, value in preds.iteritems() | |
| if item not in useritems and freqs[item] > 0] | |
| if top is not None: | |
| predlist = nlargest(top, predlist) | |
| else: | |
| predlist.sort(key=lambda x: (-x[0], -x[1])) | |
| return predlist | |
| def update(self, userdata): | |
| for userratings in userdata: | |
| for item1, rating1 in userratings.iteritems(): | |
| self.freqs.setdefault(item1, {}) | |
| self.diffs.setdefault(item1, {}) | |
| for item2, rating2 in userratings.iteritems(): | |
| self.freqs[item1].setdefault(item2, 0) | |
| self.diffs[item1].setdefault(item2, 0.0) | |
| self.freqs[item1][item2] += 1 | |
| self.diffs[item1][item2] += rating1 - rating2 | |
| for item1, ratings in self.diffs.iteritems(): | |
| for item2 in ratings: | |
| ratings[item2] /= self.freqs[item1][item2] | |
| class SlopeOneNumpy(object): | |
| def __init__(self, num_items): | |
| self.freqs = dok_matrix((num_items, num_items), dtype=int32) | |
| self.diffs = dok_matrix((num_items, num_items), dtype=float32) | |
| def recomendations(self, useritems, top=None): | |
| resultpreds, resultfreqs = {}, {} | |
| useritems = dict(useritems) | |
| for id, rating in useritems.iteritems(): | |
| freqrow = self.freqs.getrow(id) | |
| for freq, indice in zip(freqrow.data, freqrow.indices): | |
| resultpreds.setdefault(indice, 0.0) | |
| resultfreqs.setdefault(indice, 0) | |
| resultpreds[indice] += freq * (self.diffs[indice, id] + rating) | |
| resultfreqs[indice] += freq | |
| predlist = [(value / resultfreqs[item], item) | |
| for item, value in resultpreds.iteritems() | |
| if item not in useritems and resultfreqs[item] > 0] | |
| if top is not None: | |
| predlist = nlargest(top, predlist) | |
| else: | |
| predlist.sort(key=lambda x: (-x[0], -x[1])) | |
| return predlist | |
| def update(self, userdata): | |
| for userratings in userdata: | |
| for item1, rating1 in userratings.iteritems(): | |
| for item2, rating2 in userratings.iteritems(): | |
| self.freqs[item1, item2] += 1 | |
| delta = float32(rating1 - rating2) | |
| if delta != 0: | |
| # there is a bug in scipy when assign a 0 value for a already zero entry | |
| self.diffs[item1, item2] += delta | |
| for key, rating in self.diffs.iteritems(): | |
| self.diffs[key] = rating/self.freqs[key] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| from collections import defaultdict | |
| from heapq import nlargest | |
| from operator import itemgetter | |
| class SlopeOne(object): | |
| def __init__(self, num_items): | |
| self.freqs = defaultdict(lambda: defaultdict(int)) | |
| self.diffs = defaultdict(lambda: defaultdict(int)) | |
| def update(self, users): | |
| freqs, diffs = self.freqs, self.diffs | |
| for user in users: | |
| item_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
| while item_ratings: | |
| item1, rating1 = item_ratings.pop() | |
| item1_freqs = freqs[item1] | |
| item1_diffs = diffs[item1] | |
| for item2, rating2 in item_ratings: | |
| item1_freqs[item2] += 1 | |
| freqs[item2][item1] += 1 | |
| diff = rating1 - rating2 | |
| if diff: | |
| item1_diffs[item2] += diff | |
| diffs[item2][item1] -= diff | |
| for item1, ratings in diffs.iteritems(): | |
| item1_freqs = freqs[item1] | |
| for item2 in ratings: | |
| ratings[item2] /= item1_freqs[item2] | |
| def recomendations(self, item2rating, top=None): | |
| freqs, diffs = self.freqs, self.diffs | |
| predfreqs = defaultdict(int) | |
| preds = defaultdict(int) | |
| for olditem in freqs: | |
| if olditem in item2rating: | |
| continue | |
| freqs_olditem = freqs[olditem] | |
| for item in item2rating: | |
| freq = freqs_olditem.get(item) | |
| if freq is not None: | |
| predfreqs[olditem] += freq | |
| diffrating = diffs[olditem][item] | |
| preds[olditem] += freq * (item2rating[item] + diffrating) | |
| results = ((preds[item]/predfreqs[item], item) for item in preds) | |
| if top is not None: | |
| results = nlargest(top, results) | |
| else: | |
| results = sorted(results, reverse=True) | |
| return results | |
| def dump(self): | |
| diffs = self.diffs | |
| key = itemgetter(0) | |
| for i, rows in sorted(self.freqs.iteritems(), key=key): | |
| diff_i = diffs[i] | |
| for j, freq in sorted(rows.iteritems(), key=key): | |
| print i, j, freq, diff_i[j] | |
| class SlopeOne2(SlopeOne): | |
| def update(self, users): | |
| freqs, diffs = self.freqs, self.diffs | |
| for user in users: | |
| for item1, rating1 in user.iteritems(): | |
| item1_freqs = freqs[item1] | |
| item1_diffs = diffs[item1] | |
| for item2, rating2 in user.iteritems(): | |
| item1_freqs[item2] += 1 | |
| item1_diffs[item2] += rating1 - rating2 | |
| for item1, ratings in diffs.iteritems(): | |
| item1_freqs = freqs[item1] | |
| for item2 in ratings: | |
| ratings[item2] /= item1_freqs[item2] | |
| class SlopeOneSym(SlopeOne): | |
| def update(self, users): | |
| freqs, diffs = self.freqs, self.diffs | |
| for user in users: | |
| item_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
| while item_ratings: | |
| item1, rating1 = item_ratings.pop() | |
| item1_freqs = freqs[item1] | |
| item1_diffs = diffs[item1] | |
| for item2, rating2 in item_ratings: | |
| item1_freqs[item2] += 1 | |
| item1_diffs[item2] += rating1 - rating2 | |
| for item1, ratings in diffs.iteritems(): | |
| item1_freqs = freqs[item1] | |
| for item2 in ratings: | |
| ratings[item2] /= item1_freqs[item2] | |
| def recomendations(self, item2rating, top=None): | |
| freqs, diffs = self.freqs, self.diffs | |
| predfreqs = defaultdict(int) | |
| preds = defaultdict(int) | |
| for olditem in diffs: | |
| if olditem in item2rating: | |
| continue | |
| freqs_olditem = freqs[olditem] | |
| for item in item2rating: | |
| if olditem > item: | |
| freq = freqs_olditem.get(item) | |
| else: | |
| freq = freqs[item].get(olditem) | |
| if freq is not None: | |
| predfreqs[olditem] += freq | |
| if olditem > item: | |
| diffrating = diffs[olditem][item] | |
| else: | |
| diffrating = -diffs[item][olditem] | |
| preds[olditem] += freq * (item2rating[item] + diffrating) | |
| results = ((preds[item]/predfreqs[item], item) for item in preds) | |
| if top is not None: | |
| results = nlargest(top, results) | |
| else: | |
| results = sorted(results, reverse=True) | |
| return results | |
| class SlopeOneSym2(SlopeOneSym): | |
| def update(self, users): | |
| freqs, diffs = self.freqs, self.diffs | |
| for user in users: | |
| for item1, rating1 in user.iteritems(): | |
| item1_freqs = freqs[item1] | |
| item1_diffs = diffs[item1] | |
| for item2 in user: | |
| if item2 >= item1: | |
| continue | |
| item1_freqs[item2] += 1 | |
| item1_diffs[item2] += rating1 - user[item2] | |
| for item1, ratings in diffs.iteritems(): | |
| item1_freqs = freqs[item1] | |
| for item2 in ratings: | |
| ratings[item2] /= item1_freqs[item2] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import unittest | |
| import slopeone | |
| import orig_slopeone | |
| from benchmark import ItemsStorage | |
| class SlopeOneTest(unittest.TestCase): | |
| users = [ | |
| dict(b1=10, b2=5, b3=7, b4=0, b5=8), | |
| dict(b1=3, b3=6, b5=9, b7=1), | |
| dict(b2=4, b4=6, b6=8), | |
| dict(b5=2, b6=3, b7=4, b8=5, b9=6), | |
| dict(b8=10, b9=2, b10=8), | |
| dict(b4=3, b5=5, b6=7), | |
| ] | |
| recommendations = [ | |
| [('b9', 12), ('b8', 11), ('b6', 6.8), ('b7', 5)], | |
| [('b9', 8), ('b8', 7), ('b6', 7), ('b2', 8/3.), ('b4', 0)], | |
| [('b1', 12.5), ('b9', 11), ('b8', 10), ('b3', 9.5), ('b7', 9.0), ('b5', 8.4)], | |
| [('b10', 7.5), ('b3', 3), ('b1', 2), ('b2', -1), ('b4', -1.5)], | |
| [('b7', 4.5), ('b6', 3.5), ('b5', 2.5)], | |
| [('b9', 9.5), ('b8', 8.5), ('b1', 19/3.), ('b3', 16/3.), ('b7', 4), ('b2', 3.5)], | |
| ] | |
| def setUp(self): | |
| self.items_storage = items_storage = ItemsStorage() | |
| for user in self.users: | |
| for book_id in user.iterkeys(): | |
| items_storage.add_item(book_id) | |
| self.ratings = map(self._itemid2rating, self.users) | |
| def test_slope_one(self): | |
| self._test(slopeone.SlopeOne) | |
| def test_slope_one2(self): | |
| self._test(slopeone.SlopeOne2) | |
| def test_slope_one_sym(self): | |
| self._test(slopeone.SlopeOneSym) | |
| def test_slope_one_sym2(self): | |
| self._test(slopeone.SlopeOneSym2) | |
| def test_orig_slope_one(self): | |
| self._test(orig_slopeone.SlopeOne) | |
| def test_orig_slope_one_numpy(self): | |
| self._test(orig_slopeone.SlopeOneNumpy) | |
| def _itemid2rating(self, user): | |
| return dict((self.items_storage.id(book_id), rating) | |
| for book_id, rating in user.iteritems()) | |
| def _test(self, cls): | |
| slopeone = cls(len(self.items_storage)) | |
| slopeone.update(self.ratings) | |
| for user, recommendations in zip(self.users, self.recommendations): | |
| for top in None, 1, 2, 3: | |
| predictions = [ | |
| (self.items_storage.item(item_id), rating) | |
| for rating, item_id in | |
| slopeone.recomendations(self._itemid2rating(user), top=top) | |
| ] | |
| self.assertEqual(predictions, recommendations[:top]) | |
| class SmallSlopeOneTest(SlopeOneTest): | |
| users = [ | |
| dict(b1=5, b2=3, b3=2), | |
| dict(b1=3, b2=4), | |
| dict(b2=2, b3=5), | |
| ] | |
| recommendations = [ | |
| [], | |
| [('b3', 10/3.)], | |
| [('b1', 13/3.)], | |
| ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment