Skip to content

Commit

Permalink
Add AnnoyAlternatingLeastSquares class
Browse files Browse the repository at this point in the history
This class uses an Annoy Index to return the recommended items by
the inner product.

This involves transforming each vector by adding one extra dimension
as suggested in the paper:
"Speeding Up the Xbox Recommender System Using a Euclidean Transformation for
Inner-Product Spaces"
https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf

Basically this involves transforming each feature vector so that they have the same
norm, which means the cosine of this transformed vector is proportional to the
dot product (if the other vector in the cosine has a 0 in the extra dimension).
  • Loading branch information
benfred committed May 14, 2017
1 parent 9fbf1bb commit a50fb1b
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 28 deletions.
29 changes: 3 additions & 26 deletions examples/lastfm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
import logging
import time

import annoy
import numpy
import pandas
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.annoy_als import AnnoyAlternatingLeastSquares
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
TFIDFRecommender, bm25_weight)

Expand All @@ -46,32 +46,11 @@ def read_data(filename):
return data, plays


class AnnoyAlternatingLeastSquares(AlternatingLeastSquares):
    """ A version of the AlternatingLeastSquares model that uses an annoy
    index to calculate similar items. This leads to massive speedups
    when called repeatedly """

    def fit(self, Ciu):
        """ Trains the underlying ALS model on Ciu and then builds an
        angular Annoy index holding one vector per row of item_factors. """
        # train the model
        super(AnnoyAlternatingLeastSquares, self).fit(Ciu)

        # build up an index with all the item_factors
        index = annoy.AnnoyIndex(self.item_factors.shape[1], 'angular')
        for i, row in enumerate(self.item_factors):
            index.add_item(i, row)

        # The tree count has to be an integer: use floor division so that this
        # keeps working under Python 3, where '/' yields a float.
        index.build(self.factors // 2)
        self.index = index

    def similar_items(self, artistid, N=10):
        """ Returns up to N (itemid, score) pairs for the approximate nearest
        neighbours of artistid, ordered by descending score. The score is
        1 minus the distance reported by the Annoy index. """
        neighbours = self.index.get_nns_by_item(artistid, N)
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                       for other in neighbours), key=lambda x: -x[1])


def calculate_similar_artists(input_filename, output_filename,
model_name="als",
factors=50, regularization=0.01,
iterations=15,
exact=False, trees=20,
exact=False,
use_native=True,
dtype=numpy.float64,
cg=False):
Expand Down Expand Up @@ -147,8 +126,6 @@ def calculate_similar_artists(input_filename, output_filename,
parser.add_argument('--iter', type=int, default=15, dest='iterations',
help='Number of ALS iterations')
parser.add_argument('--exact', help='compute exact distances (slow)', action="store_true")
parser.add_argument('--trees', type=int, default=20, dest='treecount',
help='Number of trees to use in annoy')
parser.add_argument('--purepython',
help='dont use cython extension (slow)',
action="store_true")
Expand All @@ -166,7 +143,7 @@ def calculate_similar_artists(input_filename, output_filename,
model_name=args.model,
factors=args.factors,
regularization=args.regularization,
exact=args.exact, trees=args.treecount,
exact=args.exact,
iterations=args.iterations,
use_native=not args.purepython,
dtype=numpy.float32 if args.float32 else numpy.float64,
Expand Down
2 changes: 1 addition & 1 deletion implicit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
from . import nearest_neighbours
from . import als

__version__ = '0.2.3'
__version__ = '0.2.4'

# __all__ must contain the *names* of the public attributes as strings;
# listing the objects themselves makes `from implicit import *` fail.
__all__ = ['alternating_least_squares', 'als', 'nearest_neighbours', '__version__']
93 changes: 93 additions & 0 deletions implicit/annoy_als.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
""" Uses Annoy (https://github.com/spotify/annoy) to quickly retrieve
approximate neighbours from an ALS Matrix factorization model
"""
import itertools

import annoy
import numpy

from implicit.als import AlternatingLeastSquares


class MaximumInnerProductIndex(object):
    """ This class uses an Annoy Index to return the top related items by
    the inner product - instead of by the cosine.

    This involves transforming each vector by adding one extra dimension
    as suggested in the paper:
    "Speeding Up the Xbox Recommender System Using a Euclidean Transformation for
    Inner-Product Spaces"
    https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf

    Every indexed row is padded so that all rows share the same norm (max_norm).
    Queries are padded with a 0 in the extra dimension, which makes the cosine
    between the query and an indexed row equal to their inner product divided by
    (max_norm * norm(query)). Ranking by cosine therefore ranks by inner product.
    """

    def __init__(self, factors, num_trees=None):
        """ Builds the index.

        factors: 2-dimensional array with one row per item to index.
        num_trees: number of trees for annoy to build; when not given (or 0)
            the number of columns of factors is used.
        """
        num_trees = num_trees or factors.shape[1]

        # figure out the norms/ max norm for each row in the matrix
        norms = numpy.linalg.norm(factors, axis=1)
        self.max_norm = norms.max()

        # add an extra dimension so that the norm of each row is the same (max_norm)
        extra_dimension = numpy.sqrt(self.max_norm ** 2 - norms ** 2)
        extra = numpy.append(factors, extra_dimension.reshape(norms.shape[0], 1), axis=1)

        # add the extended matrix to an annoy index
        index = annoy.AnnoyIndex(factors.shape[1] + 1, 'angular')
        for i, row in enumerate(extra):
            index.add_item(i, row)

        index.build(num_trees)
        self.index = index

    def get_nns_by_vector(self, v, N=10):
        """ Returns (ids, scores) for the N indexed rows with the largest
        approximate inner product with the vector v. """
        # the query gets a 0 in the extra dimension so it doesn't affect the dot product
        return self._get_nns(numpy.append(v, 0), N)

    def get_nns_by_item(self, itemid, N=10):
        """ Returns (ids, scores) for the N indexed rows with the largest
        approximate inner product with the already indexed row itemid. """
        v = self.index.get_item_vector(itemid)
        # drop the padding of the stored vector before using it as a query
        v[-1] = 0
        # forward N: it used to be dropped here, so that 10 results were
        # always returned no matter what the caller asked for
        return self._get_nns(v, N)

    def _get_nns(self, v, N=10):
        """ Queries the index with the (already extended) vector v and converts
        the returned distances to inner products. """
        ids, dist = self.index.get_nns_by_vector(v, N, include_distances=True)

        # convert the distances from euclidean to cosine distance,
        # and then rescale the cosine distance to go back to inner product
        scaling = self.max_norm * numpy.linalg.norm(v)
        return ids, scaling * (1 - (numpy.array(dist) ** 2) / 2)


class AnnoyAlternatingLeastSquares(AlternatingLeastSquares):
    """ A version of the AlternatingLeastSquares model that uses an annoy
    index to calculate similar items. This leads to massive speedups
    when called repeatedly """

    def fit(self, Ciu):
        """ Trains the underlying ALS model on Ciu, and then builds two indices
        over item_factors: an angular Annoy index (cosine_index) and a
        MaximumInnerProductIndex (inner_product_index). """
        # train the model
        super(AnnoyAlternatingLeastSquares, self).fit(Ciu)

        # build up an Annoy Index with all the item_factors (for calculating similar items)
        self.cosine_index = annoy.AnnoyIndex(self.item_factors.shape[1], 'angular')
        for i, row in enumerate(self.item_factors):
            self.cosine_index.add_item(i, row)
        self.cosine_index.build(self.factors)

        # build up a separate index for the inner product (for recommend methods)
        self.inner_product_index = MaximumInnerProductIndex(self.item_factors)

    def similar_items(self, artistid, N=10):
        """ Returns a list of up to N (itemid, cosine) pairs for the approximate
        nearest neighbours of artistid in the cosine index. """
        neighbours, dist = self.cosine_index.get_nns_by_item(artistid, N, include_distances=True)
        # transform distances back to cosine from euclidean distance.
        # Materialize the pairs: on python 3 a bare zip can only be consumed once,
        # and recommend() hands back a list as well
        return list(zip(neighbours, 1 - (numpy.array(dist) ** 2) / 2))

    def recommend(self, userid, user_items, N=10, filter_items=None):
        """ Returns a list of up to N (itemid, score) pairs for userid, where
        score is the approximate inner product of the user and item factors.

        Items in user_items[userid].indices and in filter_items are left out
        (user_items is presumably a scipy CSR matrix of users x items - confirm
        against the base class).
        """
        # calculate the top N items, removing the users own liked items from the results
        liked = set(user_items[userid].indices)
        # compare against None rather than relying on truthiness, which raises
        # for numpy arrays with more than one element
        if filter_items is not None:
            liked.update(filter_items)
        # ask for enough candidates that N are left over after filtering
        count = N + len(liked)

        # get the top items by dot product
        ids, dist = self.inner_product_index.get_nns_by_vector(self.user_factors[userid], count)
        return list(itertools.islice((rec for rec in zip(ids, dist) if rec[0] not in liked), N))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
scipy>=0.16.0
Cython>=0.22.0
annoy>=1.8.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from setuptools import Extension, setup

NAME = 'implicit'
VERSION = '0.2.3'
VERSION = '0.2.4'
SRC_ROOT = 'implicit'

try:
Expand Down
16 changes: 16 additions & 0 deletions tests/annoy_als_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from __future__ import print_function

import unittest

from implicit.annoy_als import AnnoyAlternatingLeastSquares

from .recommender_base_test import TestRecommenderBaseMixin


class AnnoyALSTest(unittest.TestCase, TestRecommenderBaseMixin):
    """ Exercises AnnoyAlternatingLeastSquares through the tests that
    TestRecommenderBaseMixin provides. """

    def _get_model(self):
        """ Returns the model instance under test. """
        model = AnnoyAlternatingLeastSquares(factors=3, regularization=0)
        return model


if __name__ == "__main__":
unittest.main()

0 comments on commit a50fb1b

Please sign in to comment.