Skip to content

Commit

Permalink
Fix more issues with pickling GPU models (#636)
Browse files Browse the repository at this point in the history
* Fix more issues with pickling GPU models

The last round of fixes didn't properly save parameters (like regularization, etc.)
on the ALS models and instead relied on the defaults. This fixes that.

* add a unittest, and fix bug in test_recalculate_after_cpu_conversion
  • Loading branch information
benfred authored Dec 10, 2022
1 parent 8074a91 commit 7948029
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 24 deletions.
2 changes: 1 addition & 1 deletion implicit/cpu/topk.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ from cython.parallel import parallel, prange
cdef extern from "implicit/cpu/select.h" namespace "implicit" nogil:
cdef void select[T](const T * batch,
int batch_rows, int batch_columns, int k,
int * ids, T * distances) nogil except *
int * ids, T * distances) nogil except +


def topk(items, query, int k, item_norms=None, filter_query_items=None, filter_items=None, int num_threads=0):
Expand Down
2 changes: 1 addition & 1 deletion implicit/gpu/als.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cdef extern from "implicit/gpu/als.h" namespace "implicit::gpu" nogil:
cdef cppclass LeastSquaresSolver:
LeastSquaresSolver() except +

void calculate_yty(const Matrix & Y, Matrix * YtY, float regularization) except *
void calculate_yty(const Matrix & Y, Matrix * YtY, float regularization) except +

void least_squares(const CSRMatrix & Cui, Matrix * X,
const Matrix & YtY, const Matrix & Y,
Expand Down
28 changes: 22 additions & 6 deletions implicit/gpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,25 +132,27 @@ def fit(self, user_items, show_progress=True, callback=None):

# invalidate cached norms and squared factors
self._item_norms = self._user_norms = None
self._item_norms_host = self._user_norms_host = None
self._YtY = self._XtX = None

Ciu = implicit.gpu.CSRMatrix(Ciu)
Cui = implicit.gpu.CSRMatrix(Cui)
X = self.user_factors
Y = self.item_factors
loss = None

self._YtY = implicit.gpu.Matrix.zeros(self.factors, self.factors)
self._XtX = implicit.gpu.Matrix.zeros(self.factors, self.factors)
_YtY = implicit.gpu.Matrix.zeros(self.factors, self.factors)
_XtX = implicit.gpu.Matrix.zeros(self.factors, self.factors)

log.debug("Running %i ALS iterations", self.iterations)
with tqdm(total=self.iterations, disable=not show_progress) as progress:
for iteration in range(self.iterations):
s = time.time()
self.solver.calculate_yty(Y, self._YtY, self.regularization)
self.solver.least_squares(Cui, X, self._YtY, Y, self.cg_steps)
self.solver.calculate_yty(Y, _YtY, self.regularization)
self.solver.least_squares(Cui, X, _YtY, Y, self.cg_steps)

self.solver.calculate_yty(X, self._XtX, self.regularization)
self.solver.least_squares(Ciu, Y, self._XtX, X, self.cg_steps)
self.solver.calculate_yty(X, _XtX, self.regularization)
self.solver.least_squares(Ciu, Y, _XtX, X, self.cg_steps)
progress.update(1)

if self.calculate_training_loss:
Expand Down Expand Up @@ -290,3 +292,17 @@ def to_cpu(self) -> implicit.cpu.als.AlternatingLeastSquares:
ret.user_factors = self.user_factors.to_numpy() if self.user_factors is not None else None
ret.item_factors = self.item_factors.to_numpy() if self.item_factors is not None else None
return ret

def __getstate__(self):
    """Build the picklable state for a GPU ALS model.

    Extends the base-class state with the cached gram matrices (_XtX/_YtY)
    converted to host numpy arrays, and drops the native solver, which is
    not pickled.
    """
    state = super().__getstate__()
    # the native least-squares solver is excluded from the pickle
    state["_solver"] = None
    for name in ("_XtX", "_YtY"):
        cached = getattr(self, name)
        state[name] = cached.to_numpy() if cached is not None else None
    return state

def __setstate__(self, state):
    """Restore a GPU ALS model, moving cached gram matrices back to the GPU.

    The base class restores the common attributes; any pickled _XtX/_YtY
    host arrays are then re-wrapped as GPU matrices.
    """
    super().__setstate__(state)
    for name in ("_XtX", "_YtY"):
        host_array = getattr(self, name)
        if host_array is not None:
            setattr(self, name, implicit.gpu.Matrix(host_array))
4 changes: 4 additions & 0 deletions implicit/gpu/matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ Vector<T>::Vector(size_t size, const T *host_data)
if (host_data) {
CHECK_CUDA(
cudaMemcpy(data, host_data, size * sizeof(T), cudaMemcpyHostToDevice));
} else {
CHECK_CUDA(cudaMemset(data, 0, size * sizeof(T)));
}
}

Expand Down Expand Up @@ -75,6 +77,8 @@ Matrix::Matrix(size_t rows, size_t cols, float *host_data, bool allocate)
if (host_data) {
CHECK_CUDA(cudaMemcpy(data, host_data, rows * cols * sizeof(float),
cudaMemcpyHostToDevice));
} else {
CHECK_CUDA(cudaMemset(data, 0, rows * cols * sizeof(float)));
}
} else {
data = host_data;
Expand Down
25 changes: 12 additions & 13 deletions implicit/gpu/matrix_factorization_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,21 +215,20 @@ def save(self, file):
self.to_cpu().save(file)

def __getstate__(self):
    """Return a picklable dict of the model's state.

    Copies ``self.__dict__`` so that all model parameters (factors,
    regularization, etc.) survive pickling, nulls out attributes that are
    either unpicklable or cheap-to-recompute caches, and converts the
    GPU factor matrices to host numpy arrays.
    """
    # NOTE(review): the scraped diff showed the removed implementation
    # (an early `return {...}` with only the factors) fused above this
    # body, which made the new code unreachable; keep only the new version.
    state = self.__dict__.copy()
    # _knn is unpickleable - the rest are cached attributes that don't need to be saved
    for attr in ["_knn", "_user_norms", "_user_norms_host", "_item_norms", "_item_norms_host"]:
        state[attr] = None
    state["item_factors"] = self.item_factors.to_numpy() if self.item_factors else None
    state["user_factors"] = self.user_factors.to_numpy() if self.user_factors else None
    return state

def __setstate__(self, state):
    """Restore the model from pickled state.

    Re-initializes defaults, overlays the saved attributes, and wraps the
    factor arrays (saved as host numpy arrays by ``__getstate__``) back
    into GPU matrices.
    """
    # NOTE(review): the scraped diff fused the removed per-key restore
    # (reading state["item_factors"] / state["user_factors"] directly)
    # with the added __dict__-based restore; keep only the new version.
    # default initialize members
    self.__init__()
    self.__dict__.update(state)
    if self.item_factors is not None:
        self.item_factors = implicit.gpu.Matrix(self.item_factors)
    if self.user_factors is not None:
        self.user_factors = implicit.gpu.Matrix(self.user_factors)


def check_random_state(random_state):
Expand Down
21 changes: 18 additions & 3 deletions tests/als_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pickle
import unittest

import numpy as np
Expand Down Expand Up @@ -31,16 +32,15 @@ def test_zero_iterations_with_loss(use_gpu):
model = AlternatingLeastSquares(
factors=128, use_gpu=use_gpu, iterations=0, calculate_training_loss=True
)
model.fit(csr_matrix(np.ones((10, 10))))
model.fit(csr_matrix(np.ones((10, 10))), show_progress=False)


@pytest.mark.skipif(not HAS_CUDA, reason="test requires gpu")
def test_recalculate_after_cpu_conversion():
# test out issue reported in https://github.com/benfred/implicit/issues/597
user_items = get_checker_board(50)

model = AlternatingLeastSquares(factors=2, use_gpu=True)
model.fit(user_items)
model.fit(user_items, show_progress=False)
original_ids, _ = model.recommend(0, user_items=user_items[0], recalculate_user=True)

model = model.to_cpu().to_gpu()
Expand All @@ -49,6 +49,21 @@ def test_recalculate_after_cpu_conversion():
assert_array_equal(ids, original_ids)


@pytest.mark.parametrize("use_gpu", [True, False] if HAS_CUDA else [False])
def test_recalculate_after_pickle(use_gpu):
    """Recommendations with recalculate_user must survive a pickle round-trip."""
    user_items = get_checker_board(10)
    model = AlternatingLeastSquares(factors=2, use_gpu=use_gpu, regularization=0.1)
    model.fit(user_items, show_progress=False)
    # clear the cached gram matrices so recommend() has to recompute them
    # from the pickled model parameters
    model._XtX = model._YtY = None

    expected_ids, _ = model.recommend(0, user_items=user_items[0], recalculate_user=True)

    restored = pickle.loads(pickle.dumps(model))
    actual_ids, _ = restored.recommend(0, user_items=user_items[0], recalculate_user=True)

    assert_array_equal(actual_ids, expected_ids)


@pytest.mark.parametrize("use_native", [True, False])
def test_cg_nan(use_native):
# test issue with CG code that was causing NaN values in output:
Expand Down

0 comments on commit 7948029

Please sign in to comment.