diff --git a/.gitignore b/.gitignore index 778fecb..6bca713 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,10 @@ test_math/*.c test_math/failing.py test_math/run .Rproj.user + +example/*.p +example/*.csv +example/ml* +example/ml-1m/* +example/ml-25m/* +example/cmfrec/* diff --git a/cmfrec/__init__.py b/cmfrec/__init__.py index fdc7191..e70a18b 100644 --- a/cmfrec/__init__.py +++ b/cmfrec/__init__.py @@ -1,6 +1,6 @@ from . import wrapper_double, wrapper_float import numpy as np, pandas as pd -from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, issparse, isspmatrix_coo, isspmatrix_csr, isspmatrix_csc +from scipy.sparse import csr_array, csc_array, issparse import multiprocessing import ctypes import warnings @@ -10,6 +10,13 @@ "MostPopular", "ContentBased", "CMF_imputer"] +def _is_csr(x): + return issparse(x) and (x.format == "csr") +def _is_csc(x): + return issparse(x) and (x.format == "csc") +def _is_coo(x): + return issparse(x) and (x.format == "coo") + ### TODO: this module should move from doing operations in Python to ### using the new designated C functions for each type of prediction. 
@@ -67,7 +74,7 @@ def _take_params(self, implicit=False, alpha=40., downweight=False, maxiter=400, niter=10, parallelize="separate", corr_pairs=4, NA_as_zero=False, NA_as_zero_user=False, NA_as_zero_item=False, precompute_for_predictions=True, use_float=False, - random_state=1, verbose=True, + random_state=1, verbose=False, print_every=10, handle_interrupt=True, produce_dicts=False, nthreads=-1, n_jobs=None): assert method in ["als", "lbfgs"] @@ -88,21 +95,18 @@ def _take_params(self, implicit=False, alpha=40., downweight=False, if ((max(k_user, k_item) + k + k_main + max(user_bias, item_bias))**2) > np.iinfo(ctypes.c_int).max: raise ValueError("Number of factors is too large.") + dtype = ctypes.c_float if use_float else ctypes.c_double lambda_ = float(lambda_) if isinstance(lambda_, int) else lambda_ - if isinstance(lambda_, (list, tuple, pd.Series)): - lambda_ = np.array(lambda_) - if isinstance(lambda_, np.ndarray): - lambda_ = lambda_.reshape(-1) + if not isinstance(lambda_, float): + lambda_ = np.require(lambda_, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) assert lambda_.shape[0] == 6 assert np.all(lambda_ >= 0.) else: assert isinstance(lambda_, float) and lambda_ >= 0. l1_lambda = float(l1_lambda) if isinstance(l1_lambda, int) else l1_lambda - if isinstance(l1_lambda, (list, tuple, pd.Series)): - l1_lambda = np.array(l1_lambda) - if isinstance(l1_lambda, np.ndarray): - l1_lambda = l1_lambda.reshape(-1) + if not isinstance(l1_lambda, float): + l1_lambda = np.require(l1_lambda, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) assert l1_lambda.shape[0] == 6 assert np.all(l1_lambda >= 0.) 
else: @@ -122,6 +126,8 @@ def _take_params(self, implicit=False, alpha=40., downweight=False, nthreads = multiprocessing.cpu_count() + 1 + nthreads if nthreads is None: nthreads = 1 + if isinstance(nthreads, float): + nthreads = int(nthreads) assert isinstance(nthreads, int) and nthreads > 0 if (nthreads > 1) and (not wrapper_double._get_has_openmp()): @@ -247,11 +253,9 @@ def _take_params(self, implicit=False, alpha=40., downweight=False, self._k_main_col = self.k_main if isinstance(self.lambda_, np.ndarray): - if self.lambda_.dtype != self.dtype_: - self.lambda_ = self.lambda_.astype(self.dtype_) + self.lambda_ = np.require(self.lambda_, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if isinstance(self.l1_lambda, np.ndarray): - if self.l1_lambda.dtype != self.dtype_: - self.l1_lambda = self.l1_lambda.astype(self.dtype_) + self.l1_lambda = np.require(self.l1_lambda, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) self._reset() @@ -337,26 +341,18 @@ def _take_params_offsets(self, k_sec=0, k_main=0, add_intercepts=True): def _append_NAs(self, U, m_u, p, append_U): U_new = np.repeat(np.nan, m_u*p).reshape((m_u, p)) - if U_new.dtype != self.dtype_: - U_new = U_new.astype(U.dtype) - if not U_new.flags["C_CONTIGUOUS"]: - U_new = np.ascontiguousarray(U_new) + U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"]) U_new[np.setdiff1d(np.arange(m_u), append_U), :] = U if U_new.dtype != self.dtype_: - U_new = U_new.astype(U.dtype) + U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"]) return U_new def _decompose_coo(self, X): - row = X.row - col = X.col - val = X.data - if row.dtype != ctypes.c_int: - row = row.astype(ctypes.c_int) - if col.dtype != ctypes.c_int: - col = col.astype(ctypes.c_int) - if val.dtype != self.dtype_: - val = val.astype(self.dtype_) - return row, col, val + return( + np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", 
"C_CONTIGUOUS"]), + np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]), + np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]), + ) def _process_U_arr(self, U): Urow = np.empty(0, dtype=ctypes.c_int) @@ -366,20 +362,16 @@ def _process_U_arr(self, U): Ucols = np.empty(0, dtype=object) m = 0 p = 0 - if issparse(U) and not isspmatrix_coo(U): + if issparse(U) and not (U.format == "coo"): U = U.tocoo() - if isspmatrix_coo(U): + if _is_coo(U): Urow, Ucol, Uval = self._decompose_coo(U) m, p = U.shape elif U is not None: if isinstance(U, pd.DataFrame): - Ucols = U.columns.to_numpy() - U = U.to_numpy() - if not U.flags["C_CONTIGUOUS"]: - U = np.ascontiguousarray(U) - if U.dtype != self.dtype_: - U = U.astype(self.dtype_) - Uarr = U + Ucols = U.columns.to_numpy(copy=True) + U = U.to_numpy(copy=False, dtype=self.dtype_) + Uarr = np.require(U, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"]) m, p = Uarr.shape return Urow, Ucol, Uval, Uarr, Ucols, m, p @@ -391,18 +383,23 @@ def _convert_ids(self, X, U, U_bin, col="UserId"): append_Ub = np.empty(0, dtype=object) msg = "'X' and side info have no IDs in common." 
if (U is not None) and (U_bin is not None): - user_ids1 = np.intersect1d(U[col].to_numpy(), X[col].to_numpy()) - user_ids2 = np.intersect1d(U_bin[col].to_numpy(), X[col].to_numpy()) - user_ids3 = np.intersect1d(U_bin[col].to_numpy(), U[col].to_numpy()) + Xcol = X[col].to_numpy(copy=False) + Ucol = U[col].to_numpy(copy=False) + Ubcol = U_bin[col].to_numpy(copy=False) + + + user_ids1 = np.intersect1d(Ucol, Xcol) + user_ids2 = np.intersect1d(Ubcol, Xcol) + user_ids3 = np.intersect1d(Ubcol, Ucol) if (user_ids1.shape[0] == 0) and (user_ids2.shape[0] == 0): raise ValueError(msg) user_ids = np.intersect1d(user_ids1, user_ids2) - u_not_x = np.setdiff1d(U[col].to_numpy(), X[col].to_numpy()) - x_not_u = np.setdiff1d(X[col].to_numpy(), U[col].to_numpy()) - b_not_x = np.setdiff1d(U_bin[col].to_numpy(), X[col].to_numpy()) - x_not_b = np.setdiff1d(X[col].to_numpy(), U_bin[col].to_numpy()) - b_not_u = np.setdiff1d(U_bin[col].to_numpy(), U[col].to_numpy()) - u_not_b = np.setdiff1d(U[col].to_numpy(), U_bin[col].to_numpy()) + u_not_x = np.setdiff1d(Ucol, Xcol) + x_not_u = np.setdiff1d(Xcol, Ucol) + b_not_x = np.setdiff1d(Ubcol, Xcol) + x_not_b = np.setdiff1d(Xcol, Ubcol) + b_not_u = np.setdiff1d(Ubcol, Ucol) + u_not_b = np.setdiff1d(Ucol, Ubcol) ### There can be cases in which the sets are disjoint, ### and will need to add NAs to one of the inputs. 
@@ -415,32 +412,44 @@ def _convert_ids(self, X, U, U_bin, col="UserId"): user_ids = user_ids else: if u_not_b.shape[0] >= b_not_u.shape[0]: - user_ids = np.r_[user_ids, user_ids1, X[col].to_numpy(), user_ids3, U[col].to_numpy(), U_bin[col].to_numpy()] + user_ids = np.r_[user_ids, user_ids1, Xcol, user_ids3, Ucol, Ubcol] append_U = x_not_u append_Ub = np.r_[x_not_b, u_not_b] else: - user_ids = np.r_[user_ids, user_ids2, X[col].to_numpy(), user_ids3, U_bin[col].to_numpy(), U[col].to_numpy()] + user_ids = np.r_[user_ids, user_ids2, Xcol, user_ids3, Ubcol, Ucol] append_U = np.r_[x_not_u, b_not_u] append_Ub = x_not_b + # TODO: move away from pandas for these operations _, user_mapping_ = pd.factorize(user_ids) - X = X.assign(**{col : pd.Categorical(X[col], user_mapping_).codes}) - if X[col].dtype != ctypes.c_int: - X = X.assign(**{col : X[col].astype(ctypes.c_int)}) - U = U.assign(**{col : pd.Categorical(U[col], user_mapping_).codes}) - if U[col].dtype != ctypes.c_int: - U = U.assign(**{col : U[col].astype(ctypes.c_int)}) - U_bin = U_bin.assign(**{col : pd.Categorical(U_bin[col], user_mapping_).codes}) - if U_bin[col].dtype != ctypes.c_int: - U_bin = U_bin.assign(**{col : U_bin[col].astype(ctypes.c_int)}) + X = X.assign(**{ + col : pd.Categorical(Xcol, user_mapping_, copy=False).codes.astype(ctypes.c_int) + }) + U = U.assign(**{ + col : pd.Categorical(Ucol, user_mapping_, copy=False).codes.astype(ctypes.c_int) + }) + U_bin = U_bin.assign(**{ + col : pd.Categorical(Ubcol, user_mapping_, copy=False).codes.astype(ctypes.c_int) + }) + user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1) if append_U.shape[0]: - append_U = pd.Categorical(np.unique(append_U), user_mapping_).codes.astype(ctypes.c_int) + append_U = pd.Categorical( + np.unique(append_U), + user_mapping_, + copy=False, + ).codes append_U = np.sort(append_U) + append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if append_Ub.shape[0]: -
append_Ub = pd.Categorical(np.unique(append_Ub), user_mapping_).codes.astype(ctypes.c_int) + append_Ub = pd.Categorical( + np.unique(append_Ub), + user_mapping_, + copy=False, + ).codes append_Ub = np.sort(append_Ub) + append_Ub = np.require(append_Ub, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) else: if (U is None) and (U_bin is not None): @@ -448,12 +457,15 @@ def _convert_ids(self, X, U, U_bin, col="UserId"): swapped = True if (U is not None): - user_ids = np.intersect1d(U[col].to_numpy(), X[col].to_numpy()) + Xcol = X[col].to_numpy(copy=False) + Ucol = U[col].to_numpy(copy=False) + + user_ids = np.intersect1d(Ucol, Xcol) if user_ids.shape[0] == 0: raise ValueError(msg) - u_not_x = np.setdiff1d(U[col].to_numpy(), X[col].to_numpy()) - x_not_u = np.setdiff1d(X[col].to_numpy(), U[col].to_numpy()) + u_not_x = np.setdiff1d(Ucol, Xcol) + x_not_u = np.setdiff1d(Xcol, Ucol) if (u_not_x.shape[0]) or (x_not_u.shape[0]): ### Case0: both have the same entries ### This is the ideal situation @@ -469,29 +481,40 @@ def _convert_ids(self, X, U, U_bin, col="UserId"): user_ids = np.r_[user_ids, u_not_x] ### Case3: both have IDs that the others don't else: - user_ids = np.r_[user_ids, X[col].to_numpy(), U[col].to_numpy()] + user_ids = np.r_[user_ids, Xcol, Ucol] append_U = x_not_u _, user_mapping_ = pd.factorize(user_ids) - if not isinstance(user_mapping_, np.ndarray): - user_mapping_ = user_mapping_.to_numpy() - X = X.assign(**{col : pd.Categorical(X[col], user_mapping_).codes}) - if X[col].dtype != ctypes.c_int: - X = X.assign(**{col : X[col].astype(ctypes.c_int)}) - U = U.assign(**{col : pd.Categorical(U[col], user_mapping_).codes}) - if U[col].dtype != ctypes.c_int: - U = U.assign(**{col : U[col].astype(ctypes.c_int)}) + X = X.assign(**{ + col : pd.Categorical( + Xcol, user_mapping_, copy=False + ) + .codes + .astype(dtype=ctypes.c_int) + }) + U = U.assign(**{ + col : pd.Categorical( + Ucol, user_mapping_, copy=False + ) + .codes + 
.astype(dtype=ctypes.c_int) + }) if append_U.shape[0]: - append_U = pd.Categorical(append_U, user_mapping_).codes.astype(ctypes.c_int) + append_U = pd.Categorical( + append_U, user_mapping_, copy=False + ).codes append_U = np.sort(append_U) + append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1) else: - X_col, user_mapping_ = pd.factorize(X[col].to_numpy()) - X = X.assign(**{col : X_col}) + Xcol = X[col].to_numpy(copy=False) + Xcol, user_mapping_ = pd.factorize(Xcol) + Xcol = np.require(Xcol, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + X = X.assign(**{col : Xcol}) if X[col].dtype != ctypes.c_int: X = X.assign(**{col : X[col].astype(ctypes.c_int)}) - if not isinstance(user_mapping_, np.ndarray): - user_mapping_ = user_mapping_.to_numpy() + user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1) if swapped: U, U_bin = U_bin, U @@ -508,25 +531,22 @@ def _process_U_df(self, U, is_I=False, df_name="U"): m = 0 p = 0 if U is not None: - if "ColumnId" in U.columns.values: - Urow = U[cl_take].astype(ctypes.c_int).to_numpy() - Ucol = U.ColumnId.astype(ctypes.c_int).to_numpy() - if "Value" not in U.columns.values: + if "ColumnId" in U.columns: + Urow = U[cl_take].to_numpy(copy=False, dtype=ctypes.c_int) + Ucol = U["ColumnId"].to_numpy(copy=False, dtype=ctypes.c_int) + if "Value" not in U.columns: msg = "If passing sparse '%s', must have column 'Value'." 
msg = msg % df_name raise ValueError(msg) - Uval = U.Value.astype(self.dtype_).to_numpy() + Uval = U["Value"].to_numpy(copy=False, dtype=self.dtype_) m = int(Urow.max() + 1) p = int(Ucol.max() + 1) else: U = U.sort_values(cl_take) - Uarr = U[[cl for cl in U.columns.values if cl != cl_take]] - Ucols = Uarr.columns.to_numpy() - Uarr = Uarr.to_numpy() - if not Uarr.flags["C_CONTIGUOUS"]: - Uarr = np.ascontiguousarray(Uarr) - if Uarr.dtype != self.dtype_: - Uarr = Uarr.astype(self.dtype_) + Uarr = U[[cl for cl in U.columns if cl != cl_take]] + Ucols = Uarr.columns.to_numpy(copy=True) + Uarr = Uarr.to_numpy(copy=False, dtype=self.dtype_) + Uarr = np.require(Uarr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) m, p = Uarr.shape return Urow, Ucol, Uval, Uarr, Ucols, m, p @@ -556,7 +576,7 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False): raise ValueError("Model was not fit to %s data." % name) if isinstance(U, pd.DataFrame) and Cols.shape[0]: U = U[Cols] - U = np.array(U).reshape(-1).astype(self.dtype_) + U = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if U.shape[0] != Mat.shape[0]: raise ValueError("Dimensions of %s don't match with earlier data." % letter) @@ -568,7 +588,7 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False): raise ValueError("Model was not fit to %s binary data." % name) if isinstance(U_bin, pd.DataFrame) and (ColsBin.shape[0]): U_bin = U_bin[ColsBin] - U_bin = np.array(U_bin).reshape(-1).astype(self.dtype_) + U_bin = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if U_bin.shape[0] != MatBin.shape[0]: raise ValueError("Dimensions of %s_bin don't match with earlier data." 
% letter) @@ -576,12 +596,22 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False): U_bin = np.empty(0, dtype=self.dtype_) ### if U_col is not None: + U_val = np.require( + U_val, + dtype=self.dtype_, + requirements=["ENSUREARRAY", "C_CONTIGUOUS"] + ).reshape(-1) + U_col = np.require( + U_col, + dtype=ctypes.c_int if not self.reindex_ else None, + requirements=["ENSUREARRAY", "C_CONTIGUOUS"] + ).reshape(-1) + if U_val.shape[0] != U_col.shape[0]: + raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter)) + if Mat.shape[0] == 0: raise ValueError("Model was not fit to %s data." % name) - U_val = np.array(U_val).reshape(-1).astype(self.dtype_) if U_val.shape[0] == 0: - if np.array(U_col).shape[0] > 0: - raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter)) U_col = np.empty(0, dtype=ctypes.c_int) U_val = np.empty(0, dtype=self.dtype_) else: @@ -592,12 +622,13 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False): except Exception: raise ValueError("Sparse inputs cannot contain missing values.") else: - U_col = pd.Categorical(U_col, mapping).codes.astype(ctypes.c_int) + U_col = pd.Categorical(U_col, mapping).codes + U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if np.any(U_col < 0): raise ValueError("Sparse inputs cannot contain missing values.") - U_col = U_col.astype(ctypes.c_int) + U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) else: - U_col = np.array(U_col).reshape(-1).astype(ctypes.c_int) + U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) imin, imax = U_col.min(), U_col.max() if np.isnan(imin) or np.isnan(imax): raise ValueError("Sparse inputs cannot contain missing values.") @@ -633,37 +664,31 @@ def _process_new_U_2d(self, U, is_I=False, allow_csr=False): msg += "as the data passed to 'fit'."
raise ValueError(msg % letter) - if issparse(U) and (not isspmatrix_coo(U)) and (not isspmatrix_csr(U)): - U = U.tocoo() - elif isspmatrix_csr(U) and not allow_csr: - U = U.tocoo() + if issparse(U): + if (U.format not in ["coo", "csr"]): + U = U.tocoo() + elif (U.format == "csr") and not allow_csr: + U = U.tocoo() if isinstance(U, pd.DataFrame): - if col_id in U.columns.values: + if col_id in U.columns: warnings.warn("'%s' not meaningful for new inputs." % col_id) if Cols.shape[0]: U = U[Cols] - Uarr = U.to_numpy() - Uarr = np.ascontiguousarray(Uarr) - if Uarr.dtype != self.dtype_: - Uarr = Uarr.astype(self.dtype_) - - elif isspmatrix_coo(U): - Urow = U.row.astype(ctypes.c_int) - Ucol = U.col.astype(ctypes.c_int) - Uval = U.data.astype(self.dtype_) - elif isspmatrix_csr(U): + Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + + elif _is_coo(U): + Urow = np.require(U.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Ucol = np.require(U.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Uval = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + elif _is_csr(U): if not allow_csr: raise ValueError("Unexpected error.") - Ucsr_p = U.indptr.astype(ctypes.c_size_t) - Ucsr_i = U.indices.astype(ctypes.c_int) - Ucsr = U.data.astype(self.dtype_) + Ucsr_p = np.require(U.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Ucsr_i = np.require(U.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Ucsr = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) elif isinstance(U, np.ndarray): - if not U.flags["C_CONTIGUOUS"]: - U = np.ascontiguousarray(U) - if U.dtype != self.dtype_: - U = U.astype(self.dtype_) - Uarr = U + Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) elif U is None: pass else: @@ -694,19 +719,13 @@ def _process_new_Ub_2d(self, U_bin, 
is_I=False): raise ValueError(msg % letter) if isinstance(U_bin, pd.DataFrame): - if col_id in U_bin.columns.values: + if col_id in U_bin.columns: warnings.warn("'%s' not meaningful for new inputs." % col_id) if Cols.shape[0]: U_bin = U_bin[Cols] - Ub_arr = U_bin.to_numpy() - Ub_arr = np.ascontiguousarray(Ub_arr) - if Ub_arr.dtype != self.dtype_: - Ub_arr = Ub_arr.astype(self.dtype_) + Ub_arr = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) elif isinstance(Ub_arr, np.ndarray): - if not Ub_arr.flags["C_CONTIGUOUS"]: - Ub_arr = np.ascontiguousarray(Ub_arr) - if Ub_arr.dtype != self.dtype_: - Ub_arr = Ub_arr.astype(self.dtype_) + Ub_arr = np.require(Ub_arr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) elif Ub_arr is None: pass else: @@ -730,62 +749,54 @@ def _process_new_X_2d(self, X, W=None): W_sp = np.empty(0, dtype=self.dtype_) m, n = X.shape - if issparse(X) and (not isspmatrix_coo(X)) and (not isspmatrix_csr(X)): + # TODO: why is this needed? should it error out with CSC or is it somehow used internally? 
+ if issparse(X) and (not (X.format == "coo")) and (not (X.format == "csr")): if (W is not None) and (not issparse(W)): - if not isinstance(W, np.ndarray): - W = np.array(W).reshape(-1) - if W.shape[0] != X.nnz: - raise ValueError("'X' and 'W' have different number of entries.") - if isspmatrix_csc(X): - W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1])) + W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + if W.shape[0] != X.data.shape[0]: + raise ValueError("'X' and 'W' have different number of entries.") + if (X.format == "csc"): + W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_) W = W.tocoo() else: raise ValueError("Must pass 'X' as SciPy sparse COO if there are weights.") X = X.tocoo() - if issparse(W) and (not isspmatrix_coo(W)) and (not isspmatrix_csr(W)): + if issparse(W) and (W.format not in ["coo", "csr"]): W = W.tocoo() - if (isspmatrix_coo(X) != isspmatrix_coo(W)): - if not isspmatrix_coo(X): + if issparse(X) and issparse(W) and ((X.format == "coo") != (W.format == "coo")): + if not _is_coo(X): X = X.tocoo() - if not isspmatrix_coo(W): + if not _is_coo(W): W = W.tocoo() if issparse(W): - W = W.data + W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) - if isspmatrix_coo(X): - Xrow = X.row.astype(ctypes.c_int) - Xcol = X.col.astype(ctypes.c_int) - Xval = X.data.astype(self.dtype_) + if _is_coo(X): + Xrow = np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Xcol = np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Xval = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if W is not None: - W_sp = np.array(W).reshape(-1).astype(self.dtype_) + W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if W_sp.shape[0] != Xval.shape[0]: msg = "'W' must have the same number 
of non-zero entries " msg += "as 'X'." raise ValueError(msg) - elif isspmatrix_csr(X): - Xcsr_p = X.indptr.astype(ctypes.c_size_t) - Xcsr_i = X.indices.astype(ctypes.c_int) - Xcsr = X.data.astype(self.dtype_) + elif _is_csr(X): + Xcsr_p = np.require(X.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Xcsr_i = np.require(X.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + Xcsr = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if W is not None: - W_sp = np.array(W).reshape(-1).astype(self.dtype_) + W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if W_sp.shape[0] != Xcsr.shape[0]: msg = "'W' must have the same number of non-zero entries " msg += "as 'X'." raise ValueError(msg) elif isinstance(X, np.ndarray): - if not X.flags["C_CONTIGUOUS"]: - X = np.ascontiguousarray(X) - if X.dtype != self.dtype_: - X = X.astype(self.dtype_) - Xarr = X + Xarr = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if W is not None: assert W.shape[0] == X.shape[0] assert W.shape[1] == X.shape[1] - if not W.flags["C_CONTIGUOUS"]: - W = np.ascontiguousarray(W) - if W.dtype != self.dtype_: - W = W.astype(self.dtype_) - W_dense = W + W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) else: raise ValueError("'X' must be a SciPy CSR or COO matrix, or NumPy array.") @@ -804,30 +815,28 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True raise ValueError("'include' and 'exclude' should not contain missing values.") if include is not None and exclude is not None: raise ValueError("Cannot pass 'include' and 'exclude' together.") - include = np.array(include).reshape(-1) if include is not None \ - else np.empty(0, dtype=ctypes.c_int) - exclude = np.array(exclude).reshape(-1) if exclude is not None \ - else np.empty(0, dtype=ctypes.c_int) - - if isinstance(user, 
(list, tuple)) : - user = np.array(user) - if isinstance(item, (list, tuple)): - item = np.array(item) - if isinstance(user, pd.Series): - user = user.to_numpy() - if isinstance(item, pd.Series): - item = item.to_numpy() + + if include is not None: + include = np.require(include, requirements=["ENSUREARRAY"]).reshape(-1) + else: + include = np.empty(0, dtype=ctypes.c_int) + if exclude is not None: + exclude = np.require(exclude, requirements=["ENSUREARRAY"]).reshape(-1) + else: + exclude = np.empty(0, dtype=ctypes.c_int) + + if not np.isscalar(user): + user = np.require(user, requirements=["ENSUREARRAY"]).reshape(-1) + if not np.isscalar(item): + item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1) if user is not None: if isinstance(user, np.ndarray): - if len(user.shape) > 1: - user = user.reshape(-1) assert user.shape[0] > 0 if self.reindex_: if user.shape[0] > 1: user = pd.Categorical(user, self.user_mapping_).codes - if user.dtype != ctypes.c_int: - user = user.astype(ctypes.c_int) + user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) else: if len(self.user_dict_): try: @@ -850,14 +859,11 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True if item is not None: if isinstance(item, np.ndarray): - if len(item.shape) > 1: - item = item.reshape(-1) assert item.shape[0] > 0 if self.reindex_: if item.shape[0] > 1: item = pd.Categorical(item, self.item_mapping_).codes - if item.dtype != ctypes.c_int: - item = item.astype(ctypes.c_int) + item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) else: if len(self.item_dict_): try: @@ -893,9 +899,7 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True if np.any(include < 0): raise ValueError(msg % "include") - if include.dtype != ctypes.c_int: - include = include.astype(ctypes.c_int) - include = include.reshape(-1) + include = np.require(include, dtype=ctypes.c_int, 
requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if exclude.shape[0]: if len(self.item_dict_): try: @@ -904,36 +908,31 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True raise ValueError(msg % "exclude") else: exclude = pd.Categorical(exclude, self.item_mapping_).codes - if exclude.dtype != ctypes.c_int: - exclude = exclude.astype(ctypes.c_int) if np.any(exclude < 0): raise ValueError(msg % "exclude") - if exclude.dtype != ctypes.c_int: - exclude = exclude.astype(ctypes.c_int) - exclude = exclude.reshape(-1) + exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) else: msg = "'%s' entries must be within the range of the %s (%s)" msg += " of the data that was passed to 'fit'." if include.shape[0]: + include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) imin, imax = include.min(), include.max() if (imin < 0) or (imax >= self._B_pred.shape[0]): raise ValueError(msg % ("include", "items", "columns")) if exclude.shape[0]: + exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) emin, emax = exclude.min(), exclude.max() if (emin < 0) or (emax >= self._B_pred.shape[0]): raise ValueError(msg % ("exclude", "items", "columns")) if user is not None: - user = user.astype(ctypes.c_int) + user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if item is not None: - item = item.astype(ctypes.c_int) - if include.dtype != ctypes.c_int: - include = include.astype(ctypes.c_int) - if exclude.dtype != ctypes.c_int: - exclude = exclude.astype(ctypes.c_int) - + item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + exclude = np.require(exclude, dtype=ctypes.c_int, 
requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) return user, item, include, exclude @@ -945,29 +944,27 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, self._reset() - if issparse(X) and (not isspmatrix_coo(X)): + if issparse(X) and (not (X.format == "coo")): if (W is not None) and (not issparse(W)): - if isspmatrix_csr(X): - if not isinstance(W, np.ndarray): - W = np.array(W).reshape(-1) - if W.shape[0] != X.nnz: + if (X.format == "csr"): + W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + if W.shape[0] != X.data.shape[0]: raise ValueError("'X' and 'W' have different number of entries.") - W = csr_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1])) + W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_) W = W.tocoo() - elif isspmatrix_csc(X): - if not isinstance(W, np.ndarray): - W = np.array(W).reshape(-1) - if W.shape[0] != X.nnz: + elif (X.format == "csc"): + W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + if W.shape[0] != X.data.shape[0]: raise ValueError("'X' and 'W' have different number of entries.") - W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1])) + W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_) W = W.tocoo() else: raise ValueError("Must pass 'X' as SciPy COO if passing weights.") X = X.tocoo() - if issparse(W) and (not isspmatrix_coo(W)): + if issparse(W) and (not (W.format == "coo")): W = W.tocoo() if issparse(W): - W = W.data + W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) if isinstance(X, pd.DataFrame): msg = "If passing 'X' as DataFrame, '%s' must also be a DataFrame." @@ -985,45 +982,36 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, msg += "called 'Weight'." 
raise ValueError(msg) - assert "UserId" in X.columns.values - assert "ItemId" in X.columns.values - if (self._implicit) and ("Rating" in X.columns.values) and ("Value" not in X.columns.values): - X = X.rename(columns={"Rating":"Value"}, copy=False, inplace=False) + assert "UserId" in X.columns + assert "ItemId" in X.columns + if (self._implicit) and ("Rating" in X.columns) and ("Value" not in X.columns): + X = X.rename(columns={"Rating":"Value"}, copy=False) if self._implicit: - assert "Value" in X.columns.values + assert "Value" in X.columns else: - assert "Rating" in X.columns.values + assert "Rating" in X.columns if U is not None: - assert "UserId" in U.columns.values + assert "UserId" in U.columns if I is not None: - assert "ItemId" in I.columns.values + assert "ItemId" in I.columns if U_bin is not None: - assert "UserId" in U_bin.columns.values + assert "UserId" in U_bin.columns if I_bin is not None: - assert "ItemId" in I_bin.columns.values + assert "ItemId" in I_bin.columns X, U, U_bin, self.user_mapping_, append_U, append_Ub = self._convert_ids(X, U, U_bin, "UserId") X, I, I_bin, self.item_mapping_, append_I, append_Ib = self._convert_ids(X, I, I_bin, "ItemId") - Xrow = X.UserId.to_numpy() - Xcol = X.ItemId.to_numpy() - if Xrow.dtype != ctypes.c_int: - Xrow = Xrow.astype(ctypes.c_int) - if Xcol.dtype != ctypes.c_int: - Xcol = Xcol.astype(ctypes.c_int) - if self._implicit: - Xval = X.Value.to_numpy() - else: - Xval = X.Rating.to_numpy() - if Xval.dtype != self.dtype_: - Xval = Xval.astype(self.dtype_) + Xrow = X["UserId"].to_numpy(copy=False, dtype=ctypes.c_int) + Xcol = X["ItemId"].to_numpy(copy=False, dtype=ctypes.c_int) + Xval = X["Value" if self._implicit else "Rating"].to_numpy(copy=False, dtype=self.dtype_) if Xval.shape[0] == 0: raise ValueError("'X' contains no non-zero entries.") Xarr = np.empty((0,0), dtype=self.dtype_) W_sp = np.empty(0, dtype=self.dtype_) - if "Weight" in X.columns.values: - W_sp = X.Weight.astype(self.dtype_).to_numpy() + if 
"Weight" in X.columns: + W_sp = X["Weight"].to_numpy(copy=False, dtype=self.dtype_) W_dense = np.empty((0,0), dtype=self.dtype_) Urow, Ucol, Uval, Uarr, self._U_cols, m_u, p = self._process_U_df(U, False, "U") @@ -1037,11 +1025,11 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, qbin = 0 msg = "Binary side info data cannot be passed in sparse format." if U_bin is not None: - if "ColumnId" in U_bin.columns.values: + if "ColumnId" in U_bin.columns: raise ValueError(msg) _1, _2, _3, Ub_arr, self._Ub_cols, m_ub, pbin = self._process_U_df(U_bin, False, "U_bin") if I_bin is not None: - if "ColumnId" in I_bin.columns.values: + if "ColumnId" in I_bin.columns: raise ValueError(msg) _1, _2, _3, Ib_arr, self._Ib_cols, n_ib, qbin = self._process_U_df(I_bin, True, "U_bin") @@ -1067,28 +1055,28 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, self.user_dict_ = {self.user_mapping_[i]:i for i in range(self.user_mapping_.shape[0])} self.item_dict_ = {self.item_mapping_[i]:i for i in range(self.item_mapping_.shape[0])} - elif isspmatrix_coo(X) or isinstance(X, np.ndarray): - if issparse(U) and not isspmatrix_coo(U): + elif _is_coo(X) or isinstance(X, np.ndarray): + if issparse(U) and not (U.format == "coo"): U = U.tocoo() - if issparse(I) and not isspmatrix_coo(I): + if issparse(I) and not (I.format == "coo"): I = I.tocoo() msg = " must be a Pandas DataFrame, NumPy array, or SciPy sparse COO matrix." msg_bin = " must be a Pandas DataFrame or NumPy array." 
- if U is not None and not (isinstance(U, (pd.DataFrame, np.ndarray)) or isspmatrix_coo(U)): + if U is not None and not (isinstance(U, (pd.DataFrame, np.ndarray)) or _is_coo(U)): raise ValueError("'U'" + msg) - if I is not None and not (isinstance(I, (pd.DataFrame, np.ndarray)) or isspmatrix_coo(I)): + if I is not None and not (isinstance(I, (pd.DataFrame, np.ndarray)) or _is_coo(I)): raise ValueError("'I'" + msg) if U_bin is not None and not isinstance(U_bin, (pd.DataFrame, np.ndarray)): raise ValueError("'U_bin'" + msg_bin) if I_bin is not None and not isinstance(I_bin, (pd.DataFrame, np.ndarray)): raise ValueError("'I_bin'" + msg_bin) if W is not None: - if isinstance(W, (list, pd.Series)): - W = np.array(W) - if (len(W.shape) > 1) and isspmatrix_coo(X): + if not issparse(W): + W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY"]) + if (len(W.shape) > 1) and _is_coo(X): W = W.reshape(-1) if (not isinstance(W, np.ndarray)) or \ - (isspmatrix_coo(X) and W.shape[0] != X.nnz) or\ + (_is_coo(X) and W.shape[0] != X.data.shape[0]) or\ (isinstance(X, np.ndarray) and (W.shape[0] != X.shape[0] or W.shape[1] != X.shape[1])): raise ValueError("'W' must be an array with the same number of entries as 'X'.") @@ -1107,14 +1095,14 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, W_sp = np.empty(0, dtype=self.dtype_) W_dense = np.empty((0,0), dtype=self.dtype_) if W is not None: - if issparse(W) and not isspmatrix_coo(W): + if issparse(W) and not (W.format == "coo"): W = W.tocoo() if issparse(W): - W = W.data - if isspmatrix_coo(X): + W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + if _is_coo(X): W_sp = W.astype(self.dtype_) else: - W_dense = W.astype(self.dtype_) + W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) self.reindex_ = False @@ -1127,7 +1115,7 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None, else: m = int(Xrow.max() + 1) n 
= int(Xcol.max() + 1) - if isspmatrix_coo(X): + if _is_coo(X): m = max(m, X.shape[0]) n = max(n, X.shape[1]) if enforce_same_shape: @@ -1230,11 +1218,12 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None): if self._only_prediction_info: raise ValueError("Cannot use this function after dropping non-essential matrices.") + user_was_not_None = not (user is None) user, item, _1, _2 = self._process_users_items(user, item, None, None) c_funs = wrapper_float if self.use_float else wrapper_double - if user is not None: + if user_was_not_None: assert user.shape[0] == item.shape[0] if user.shape[0] == 1: @@ -1268,8 +1257,8 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None): self.user_bias_, self.item_bias_, self.glob_mean_, - np.array(user).astype(ctypes.c_int), - np.array(item).astype(ctypes.c_int), + np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), + np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), self._k_pred, self.k_user, self.k_item, self._k_main_col, self.nthreads ) @@ -1280,15 +1269,15 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None): self.user_bias_, self.item_bias_, self.glob_mean_, - np.array(user).astype(ctypes.c_int), - np.array(item).astype(ctypes.c_int), + np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), + np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), self._k_pred, self.k_user, self.k_item, self._k_main_col, self.nthreads ) #### When passing the factors directly else: - item = np.array([item]).reshape(-1) + item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1) nan_entries = (item == -1) outp = self._B_pred[item, self.k_item:].reshape((item.shape[0],-1)).dot(a_vec[self.k_user:]) outp += a_bias + self.glob_mean_ @@ -1318,7 +1307,7 @@ def _predict_new(self, user, B): np.zeros(n, dtype=self.dtype_) if self.item_bias 
\ else np.empty(0, dtype=self.dtype_), self.glob_mean_, - np.array(user).astype(ctypes.c_int), + np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), np.arange(n).astype(ctypes.c_int), self._k_pred, self.k_user, self.k_item, self._k_main_col, self.nthreads @@ -1347,7 +1336,7 @@ def _predict_user_multiple(self, A, item, bias=None): self.item_bias_, self.glob_mean_, np.arange(m).astype(ctypes.c_int), - np.array(item).astype(ctypes.c_int), + np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), self._k_pred, self.k_user, self.k_item, self._k_main_col, self.nthreads ) @@ -1359,7 +1348,7 @@ def _predict_user_multiple(self, A, item, bias=None): self.item_bias_, self.glob_mean_, np.arange(m).astype(ctypes.c_int), - np.array(item).astype(ctypes.c_int), + np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1), self._k_pred, self.k_user, self.k_item, self._k_main_col, self.nthreads ) @@ -1436,7 +1425,7 @@ def _topN(self, user=None, a_vec=None, a_bias=0, B=None, msg += "fewer than 'n' to rank." raise ValueError(msg) - if user is not None: + if (user is not None) and (user.min() >= 0): user = user[0] a_vec = self._A_pred[user].reshape(-1) user_bias_ = 0. 
@@ -1587,15 +1576,11 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None, W_sp = np.empty(0, dtype=self.dtype_) if len(X.shape) > 1: warnings.warn("Passed a 2-d array for 'X' - method expects a single row.") - X = np.array(X).reshape(-1) - if X.dtype != self.dtype_: - X = X.astype(self.dtype_) + X = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if X.shape[0] != self._n_orig: raise ValueError("'X' must have the same columns as when passed to 'fit'.") if W is not None: - W_dense = np.array(W).reshape(-1) - if W_dense.dtype != self.dtype_: - W_dense = W_dense.astype(self.dtype_) + W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if W_dense.shape[0] != X.shape[0]: raise ValueError("'W' must have the same number of entries as X.") else: @@ -1603,28 +1588,20 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None, else: X = np.empty(0, dtype=self.dtype_) W_dense = np.empty(0, dtype=self.dtype_) - X_val = np.array(X_val).reshape(-1) - if X_val.dtype != self.dtype_: - X_val = X_val.astype(self.dtype_) + X_val = np.require(X_val, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if X_val.shape[0] == 0: - X_col = np.array(X_col).reshape(-1) - if X_col.dtype != ctypes.c_int: - X_col = X_col.astype(ctypes.c_int) + X_col = np.require(X_col, requirements=["ENSUREARRAY"]).reshape(-1) if X_col.shape[0] > 0: raise ValueError("'X_col' and 'X_val' must have the same number of entries.") else: if self.reindex_: - X_col = np.array(X_col).reshape(-1) X_col = pd.Categorical(X_col, self.item_mapping_).codes - if X_col.dtype != ctypes.c_int: - X_col = X_col.astype(ctypes.c_int) + X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if np.any(X_col < 0): raise ValueError("'X_col' must have the same item/column entries as passed to 'fit'.") else: - X_col = 
np.array(X_col).reshape(-1) - if X_col.dtype != ctypes.c_int: - X_col = X_col.astype(ctypes.c_int) + X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) imin, imax = np.min(X_col), np.max(X_col) if (imin < 0) or (imax >= self._n_orig) or np.isnan(imin) or np.isnan(imax): msg = "Column indices ('X_col') must be within the range" @@ -1639,9 +1616,7 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None, raise ValueError("'X' is empty.") if W is not None: - W_sp = np.array(W).reshape(-1) - if W_sp.dtype != self.dtype_: - W_sp = W_sp.astype(self.dtype_) + W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if W_sp.shape[0] != X_col.shape[0]: raise ValueError("'W' must have the same number of entries as 'X_val'.") else: @@ -1665,26 +1640,25 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None, def _process_transform_inputs(self, X, U, U_bin, W, replace_existing): if (W is not None) and (issparse(W) != issparse(X)): raise ValueError("'X' and 'W' must be in the same format.") - if issparse(X) and not isspmatrix_coo(X): + if issparse(X) and not (X.format == "coo"): if (W is not None) and (not issparse(W)): - if not isinstance(W, np.ndarray): - W = np.array(W).reshape(-1) - if W.shape[0] != X.nnz: - raise ValueError("'X' and 'W' must have the same number of entries.") - if isspmatrix_csr(X): - W = csr_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1])) + W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) + if W.shape[0] != X.data.shape[0]: + raise ValueError("'X' and 'W' must have the same number of entries.") + if _is_csr(X): + W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_) W = W.tocoo() - elif isspmatrix_csc(X): - W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1])) + elif _is_csc(X): + W = csc_array((W, X.indices, 
X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_) W = W.tocoo() else: raise ValueError("Must pass 'X' as SciPy COO if there are weights.") X = X.tocoo() - if issparse(W) and not isspmatrix_coo(W): + if issparse(W) and not (W.format == "coo"): W = W.tocoo() if issparse(W): - W = W.data - if issparse(U) and (not isspmatrix_coo(U)) and (not isspmatrix_csr(U)): + W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + if issparse(U) and (U.format not in ["coo", "csr"]): U = U.tocoo() if (X is None) and (U is None) and (U_bin is None): @@ -1697,7 +1671,7 @@ def _process_transform_inputs(self, X, U, U_bin, W, replace_existing): raise ValueError("Must pass 'X' if not passing 'replace_existing'.") if isinstance(X, np.ndarray): mask_take = ~pd.isnull(X) - elif isspmatrix_coo(X): + elif _is_coo(X): mask_take = np.repeat(False, X.shape[0]*X.shape[1]).reshape((X.shape[0], X.shape[1])) mask_take[X.row, X.col] = True else: @@ -1773,13 +1747,13 @@ def _transform_step(self, A, A_bias, mask_take, Xorig): if self.item_bias: outp += self.item_bias_.reshape((1,-1)) - if issparse(Xorig) and not isspmatrix_coo(Xorig): + if issparse(Xorig) and not (Xorig.format == "coo"): Xorig = Xorig.tocoo() if mask_take is not None: if isinstance(Xorig, np.ndarray): outp[mask_take] = Xorig[mask_take] - elif isspmatrix_coo(X): + elif _is_coo(X): outp[mask_take] = Xorig.data else: raise ValueError("'X' must be a SciPy COO matrix or NumPy array.") @@ -2918,7 +2892,7 @@ def __init__(self, k=40, lambda_=1e+1, method="als", use_cg=True, nonneg=False, nonneg_C=False, nonneg_D=False, max_cd_steps=100, precompute_for_predictions=True, include_all_X=True, use_float=True, - random_state=1, verbose=True, print_every=10, + random_state=1, verbose=False, print_every=10, handle_interrupt=True, produce_dicts=False, nthreads=-1, n_jobs=None): self.k = k @@ -4316,10 +4290,10 @@ def from_model_matrices(A, B, glob_mean=0., precompute=True, ): raise ValueError("Must pass both 
'scaling_biasA' and 'scaling_biasB'.") - if (not isinstance(A, np.ndarray)) or (not A.flags["C_CONTIGUOUS"]): - A = np.ascontiguousarray(A) - if (not isinstance(B, np.ndarray)) or (not B.flags["C_CONTIGUOUS"]): - B = np.ascontiguousarray(B) + dtype = ctypes.c_double if not use_float else ctypes.c_float + A = np.require(A, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + B = np.require(B, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + if (len(A.shape) != 2) or (len(B.shape) != 2): raise ValueError("Model matrices must be 2-dimensional.") @@ -4348,31 +4322,14 @@ def from_model_matrices(A, B, glob_mean=0., precompute=True, n_jobs = n_jobs) new_model._init() - dtype = ctypes.c_double if not use_float else ctypes.c_float - if user_bias is not None: - if not isinstance(user_bias, np.ndarray): - user_bias = np.array(user_bias).reshape(-1) + user_bias = np.require(user_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if user_bias.shape[0] != A.shape[0]: raise ValueError("'user_bias' dimension does not match with 'A'.") - if not user_bias.flags["C_CONTIGUOUS"]: - user_bias = np.ascontiguousarray(user_bias) - if user_bias.dtype != dtype: - user_bias = user_bias.astype(dtype) if item_bias is not None: - if not isinstance(item_bias, np.ndarray): - item_bias = np.array(item_bias).reshape(-1) + item_bias = np.require(item_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1) if item_bias.shape[0] != B.shape[0]: raise ValueError("'item_bias' dimension does not match with 'B'.") - if not item_bias.flags["C_CONTIGUOUS"]: - item_bias = np.ascontiguousarray(item_bias) - if item_bias.dtype != dtype: - item_bias = item_bias.astype(dtype) - - if (A.dtype != dtype): - A = A.astype(dtype) - if (B.dtype != dtype): - B = B.astype(dtype) new_model.A_ = A new_model.B_ = B @@ -4915,9 +4872,9 @@ def fit(self, X, U=None, I=None): """ self._init() - if issparse(X) and not isspmatrix_coo(X): + if issparse(X) and not 
(X.format == "coo"): X = X.tocoo() - if not isspmatrix_coo(X) and not isinstance(X, pd.DataFrame): + if not _is_coo(X) and not isinstance(X, pd.DataFrame): raise ValueError("'X' must be a Pandas DataFrame or SciPy sparse COO matrix.") return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=None) @@ -5662,10 +5619,10 @@ def from_model_matrices(A, B, precompute=True, prediction methods such as ``topN`` and ``topN_warm`` can be used as if it had been fitted through this software. """ - if (not isinstance(A, np.ndarray)) or (not A.flags["C_CONTIGUOUS"]): - A = np.ascontiguousarray(A) - if (not isinstance(B, np.ndarray)) or (not B.flags["C_CONTIGUOUS"]): - B = np.ascontiguousarray(B) + dtype = ctypes.c_double if not use_float else ctypes.c_float + A = np.require(A, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + B = np.require(B, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]) + if (len(A.shape) != 2) or (len(B.shape) != 2): raise ValueError("Model matrices must be 2-dimensional.") @@ -5676,8 +5633,7 @@ def from_model_matrices(A, B, precompute=True, raise ValueError("Empty model matrices not supported.") - dtype = ctypes.c_double if not use_float else ctypes.c_float - + new_model = CMF_implicit(k = k, lambda_ = lambda_, l1_lambda = l1_lambda, @@ -5689,11 +5645,6 @@ def from_model_matrices(A, B, precompute=True, n_jobs = n_jobs) new_model._init() - if (A.dtype != dtype): - A = A.astype(dtype) - if (B.dtype != dtype): - B = B.astype(dtype) - new_model.A_ = A new_model.B_ = B @@ -6386,7 +6337,7 @@ def __init__(self, k=50, lambda_=1e1, method="lbfgs", use_cg=True, maxiter=10000, niter=10, parallelize="separate", corr_pairs=7, max_cg_steps=3, precondition_cg=False, finalize_chol=True, NA_as_zero=False, use_float=False, - random_state=1, verbose=True, print_every=100, + random_state=1, verbose=False, print_every=100, produce_dicts=False, handle_interrupt=True, nthreads=-1, n_jobs=None): self.k = k @@ -7907,7 +7858,7 @@ class 
ContentBased(_OMF_Base): """ def __init__(self, k=20, lambda_=1e2, user_bias=False, item_bias=False, add_intercepts=True, maxiter=3000, corr_pairs=3, - parallelize="separate", verbose=True, print_every=100, + parallelize="separate", verbose=False, print_every=100, random_state=1, use_float=True, produce_dicts=False, handle_interrupt=True, start_with_ALS=True, nthreads=-1, n_jobs=None): @@ -8310,7 +8261,7 @@ def predict_cold(self, U, items): Predicted ratings for the requested user-item combinations. """ assert self.is_fitted_ - items = np.array(items).reshape(-1) + items = np.require(items, requirements=["ENSUREARRAY"]).reshape(-1) assert items.shape[0] == U.shape[0] _1, items, _2, _3 = self._process_users_items(None, items, None, None) diff --git a/example/cmfrec_movielens_sideinfo.ipynb b/example/cmfrec_movielens_sideinfo.ipynb index b65af42..64fab36 100644 --- a/example/cmfrec_movielens_sideinfo.ipynb +++ b/example/cmfrec_movielens_sideinfo.ipynb @@ -182,7 +182,6 @@ " \n", " \n", " ItemId\n", - " pc0\n", " pc1\n", " pc2\n", " pc3\n", @@ -191,8 +190,8 @@ " pc6\n", " pc7\n", " pc8\n", + " pc9\n", " ...\n", - " pc40\n", " pc41\n", " pc42\n", " pc43\n", @@ -202,128 +201,129 @@ " pc47\n", " pc48\n", " pc49\n", + " pc50\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", - " 1.192433\n", - " 2.034965\n", - " 2.679781\n", - " 1.154823\n", - " 0.715302\n", - " 0.982528\n", - " 1.251208\n", - " -0.792800\n", - " 1.605826\n", + " 1.193171\n", + " 2.085621\n", + " 2.634135\n", + " 1.156088\n", + " 0.721649\n", + " 0.995436\n", + " 1.250474\n", + " -0.779532\n", + " 1.616702\n", " ...\n", - " -0.312568\n", - " -0.089161\n", - " -0.053227\n", - " 0.230116\n", - " 0.210211\n", - " 0.098109\n", - " -0.267214\n", - " -0.191760\n", - " 0.032658\n", - " 0.065116\n", + " -0.317134\n", + " -0.070338\n", + " -0.019553\n", + " 0.169051\n", + " 0.201415\n", + " -0.094831\n", + " -0.250461\n", + " -0.149919\n", + " -0.031735\n", + " -0.177708\n", " \n", " \n", " 1\n", " 2\n", - " 
-1.333200\n", - " 1.719346\n", - " 1.383137\n", - " 0.788332\n", - " -0.487431\n", - " 0.376546\n", - " 0.803104\n", - " -0.606602\n", - " 0.914494\n", + " -1.333533\n", + " 1.743796\n", + " 1.352161\n", + " 0.795724\n", + " -0.484175\n", + " 0.380645\n", + " 0.804462\n", + " -0.598527\n", + " 0.917250\n", " ...\n", - " 0.265190\n", - " -0.294507\n", - " 0.058127\n", - " 0.013155\n", - " 0.232314\n", - " 0.332297\n", - " 0.271467\n", - " 0.112416\n", - " -0.111115\n", - " -0.042173\n", + " 0.300060\n", + " -0.261956\n", + " 0.054457\n", + " 0.003863\n", + " 0.304605\n", + " -0.315796\n", + " 0.360203\n", + " 0.152770\n", + " 0.144790\n", + " -0.096549\n", " \n", " \n", " 2\n", " 3\n", - " -1.363421\n", - " -0.034093\n", - " 0.528633\n", - " -0.312122\n", - " 0.468820\n", - " 0.164593\n", - " 0.021909\n", - " 0.161554\n", - " -0.231992\n", + " -1.363395\n", + " -0.017107\n", + " 0.530395\n", + " -0.316202\n", + " 0.469430\n", + " 0.164630\n", + " 0.019083\n", + " 0.159188\n", + " -0.232969\n", " ...\n", - " 0.212216\n", - " -0.103897\n", - " -0.279957\n", - " 0.032861\n", - " 0.054336\n", - " 0.212665\n", - " -0.174429\n", - " -0.105532\n", - " -0.147704\n", - " 0.137516\n", + " 0.215020\n", + " -0.060682\n", + " -0.280852\n", + " 0.001087\n", + " 0.084960\n", + " -0.257190\n", + " -0.136963\n", + " -0.113914\n", + " 0.128352\n", + " -0.203658\n", " \n", " \n", " 3\n", " 4\n", - " -1.238094\n", - " -1.014399\n", - " 0.790394\n", - " -0.296004\n", - " -0.095043\n", - " -0.052266\n", - " -0.180244\n", - " -0.768811\n", - " -0.400559\n", + " -1.237840\n", + " -0.993731\n", + " 0.809815\n", + " -0.303009\n", + " -0.088991\n", + " -0.049621\n", + " -0.179544\n", + " -0.771278\n", + " -0.400499\n", " ...\n", - " 0.074246\n", - " 0.033976\n", - " -0.225773\n", - " 0.416155\n", - " 0.282287\n", - " -0.324412\n", - " -0.228171\n", - " -0.191667\n", - " -0.488943\n", - " -0.468794\n", + " 0.066207\n", + " 0.056054\n", + " -0.223027\n", + " 0.400157\n", + " 0.292300\n", + " 
0.260936\n", + " -0.307608\n", + " -0.224141\n", + " 0.488955\n", + " 0.439189\n", " \n", " \n", " 4\n", " 5\n", - " -1.613220\n", - " -0.280142\n", - " 1.119149\n", - " -0.130238\n", - " 0.397091\n", - " 0.187158\n", - " 0.108864\n", - " -0.273748\n", - " -0.260166\n", + " -1.611499\n", + " -0.251899\n", + " 1.126443\n", + " -0.135702\n", + " 0.403340\n", + " 0.187289\n", + " 0.108451\n", + " -0.275341\n", + " -0.261142\n", " ...\n", - " 0.110984\n", - " -0.126241\n", - " -0.234988\n", - " 0.487649\n", - " -0.027990\n", - " 0.103862\n", - " -0.218475\n", - " -0.315778\n", - " -0.070719\n", - " 0.052140\n", + " 0.109560\n", + " -0.086042\n", + " -0.236327\n", + " 0.461589\n", + " 0.013350\n", + " -0.192557\n", + " -0.234025\n", + " -0.369643\n", + " -0.041060\n", + " -0.074656\n", " \n", " \n", "\n", @@ -331,26 +331,26 @@ "" ], "text/plain": [ - " ItemId pc0 pc1 pc2 pc3 pc4 pc5 \\\n", - "0 1 1.192433 2.034965 2.679781 1.154823 0.715302 0.982528 \n", - "1 2 -1.333200 1.719346 1.383137 0.788332 -0.487431 0.376546 \n", - "2 3 -1.363421 -0.034093 0.528633 -0.312122 0.468820 0.164593 \n", - "3 4 -1.238094 -1.014399 0.790394 -0.296004 -0.095043 -0.052266 \n", - "4 5 -1.613220 -0.280142 1.119149 -0.130238 0.397091 0.187158 \n", + " ItemId pc1 pc2 pc3 pc4 pc5 pc6 \\\n", + "0 1 1.193171 2.085621 2.634135 1.156088 0.721649 0.995436 \n", + "1 2 -1.333533 1.743796 1.352161 0.795724 -0.484175 0.380645 \n", + "2 3 -1.363395 -0.017107 0.530395 -0.316202 0.469430 0.164630 \n", + "3 4 -1.237840 -0.993731 0.809815 -0.303009 -0.088991 -0.049621 \n", + "4 5 -1.611499 -0.251899 1.126443 -0.135702 0.403340 0.187289 \n", "\n", - " pc6 pc7 pc8 ... pc40 pc41 pc42 pc43 \\\n", - "0 1.251208 -0.792800 1.605826 ... -0.312568 -0.089161 -0.053227 0.230116 \n", - "1 0.803104 -0.606602 0.914494 ... 0.265190 -0.294507 0.058127 0.013155 \n", - "2 0.021909 0.161554 -0.231992 ... 0.212216 -0.103897 -0.279957 0.032861 \n", - "3 -0.180244 -0.768811 -0.400559 ... 
0.074246 0.033976 -0.225773 0.416155 \n", - "4 0.108864 -0.273748 -0.260166 ... 0.110984 -0.126241 -0.234988 0.487649 \n", + " pc7 pc8 pc9 ... pc41 pc42 pc43 pc44 \\\n", + "0 1.250474 -0.779532 1.616702 ... -0.317134 -0.070338 -0.019553 0.169051 \n", + "1 0.804462 -0.598527 0.917250 ... 0.300060 -0.261956 0.054457 0.003863 \n", + "2 0.019083 0.159188 -0.232969 ... 0.215020 -0.060682 -0.280852 0.001087 \n", + "3 -0.179544 -0.771278 -0.400499 ... 0.066207 0.056054 -0.223027 0.400157 \n", + "4 0.108451 -0.275341 -0.261142 ... 0.109560 -0.086042 -0.236327 0.461589 \n", "\n", - " pc44 pc45 pc46 pc47 pc48 pc49 \n", - "0 0.210211 0.098109 -0.267214 -0.191760 0.032658 0.065116 \n", - "1 0.232314 0.332297 0.271467 0.112416 -0.111115 -0.042173 \n", - "2 0.054336 0.212665 -0.174429 -0.105532 -0.147704 0.137516 \n", - "3 0.282287 -0.324412 -0.228171 -0.191667 -0.488943 -0.468794 \n", - "4 -0.027990 0.103862 -0.218475 -0.315778 -0.070719 0.052140 \n", + " pc45 pc46 pc47 pc48 pc49 pc50 \n", + "0 0.201415 -0.094831 -0.250461 -0.149919 -0.031735 -0.177708 \n", + "1 0.304605 -0.315796 0.360203 0.152770 0.144790 -0.096549 \n", + "2 0.084960 -0.257190 -0.136963 -0.113914 0.128352 -0.203658 \n", + "3 0.292300 0.260936 -0.307608 -0.224141 0.488955 0.439189 \n", + "4 0.013350 -0.192557 -0.234025 -0.369643 -0.041060 -0.074656 \n", "\n", "[5 rows x 51 columns]" ] @@ -424,122 +424,122 @@ " \n", " 0\n", " 1\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " True\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 1\n", " 2\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", + " 
False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 2\n", " 3\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 3\n", " 4\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 4\n", " 5\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", "\n", @@ -548,32 +548,32 @@ ], "text/plain": [ " UserId Gender_F Gender_M Age_1 Age_18 Age_25 Age_35 Age_45 Age_50 \\\n", - "0 1 1 0 1 0 0 
0 0 0 \n", - "1 2 0 1 0 0 0 0 0 0 \n", - "2 3 0 1 0 0 1 0 0 0 \n", - "3 4 0 1 0 0 0 0 1 0 \n", - "4 5 0 1 0 0 1 0 0 0 \n", + "0 1 True False True False False False False False \n", + "1 2 False True False False False False False False \n", + "2 3 False True False False True False False False \n", + "3 4 False True False False False False True False \n", + "4 5 False True False False True False False False \n", "\n", " Age_56 ... Occupation_unemployed Occupation_writer \\\n", - "0 0 ... 0 0 \n", - "1 1 ... 0 0 \n", - "2 0 ... 0 0 \n", - "3 0 ... 0 0 \n", - "4 0 ... 0 1 \n", + "0 False ... False False \n", + "1 True ... False False \n", + "2 False ... False False \n", + "3 False ... False False \n", + "4 False ... False True \n", "\n", " Region_Middle Atlantic Region_Midwest Region_New England Region_South \\\n", - "0 0 1 0 0 \n", - "1 0 0 0 1 \n", - "2 0 1 0 0 \n", - "3 0 0 1 0 \n", - "4 0 1 0 0 \n", + "0 False True False False \n", + "1 False False False True \n", + "2 False True False False \n", + "3 False False True False \n", + "4 False True False False \n", "\n", " Region_Southwest Region_UnknownOrNonUS Region_UsOther Region_West \n", - "0 0 0 0 0 \n", - "1 0 0 0 0 \n", - "2 0 0 0 0 \n", - "3 0 0 0 0 \n", - "4 0 0 0 0 \n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", "\n", "[5 rows x 39 columns]" ] @@ -627,8 +627,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 13 s, sys: 105 ms, total: 13.1 s\n", - "Wall time: 892 ms\n" + "CPU times: user 6.75 s, sys: 1.56 s, total: 8.31 s\n", + "Wall time: 592 ms\n" ] }, { @@ -684,8 +684,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 17.2 s, sys: 168 ms, total: 17.4 s\n", - "Wall time: 1.18 s\n" + "CPU times: user 11.2 s, sys: 13 s, total: 24.2 s\n", + "Wall time: 1.5 s\n" ] }, { @@ -740,8 +740,8 @@ "name": "stdout", "output_type": "stream", 
"text": [ - "CPU times: user 26min 6s, sys: 9.69 s, total: 26min 16s\n", - "Wall time: 1min 39s\n" + "CPU times: user 13min 8s, sys: 23min 31s, total: 36min 39s\n", + "Wall time: 1min 57s\n" ] }, { @@ -761,9 +761,9 @@ "from cmfrec import ContentBased\n", "\n", "model_content_based = ContentBased(k=40, maxiter=0, user_bias=False, item_bias=False)\n", - "model_content_based.fit(X=ratings.loc[ratings.ItemId.isin(item_sideinfo_pca.ItemId)],\n", + "model_content_based.fit(X=ratings.loc[lambda x: x[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])],\n", " U=user_side_info,\n", - " I=item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(ratings.ItemId)])" + " I=item_sideinfo_pca.loc[lambda x: x[\"ItemId\"].isin(ratings[\"ItemId\"])])" ] }, { @@ -784,8 +784,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.02 s, sys: 39.9 ms, total: 1.06 s\n", - "Wall time: 70.6 ms\n" + "CPU times: user 304 ms, sys: 800 ms, total: 1.1 s\n", + "Wall time: 105 ms\n" ] }, { @@ -852,35 +852,35 @@ " \n", " \n", " UserId\n", - " 948.0\n", + " 948\n", " \n", " \n", " Gender_M\n", - " 1.0\n", + " True\n", " \n", " \n", " Age_56\n", - " 1.0\n", + " True\n", " \n", " \n", " Occupation_programmer\n", - " 1.0\n", + " True\n", " \n", " \n", " Region_Midwest\n", - " 1.0\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 947\n", - "UserId 948.0\n", - "Gender_M 1.0\n", - "Age_56 1.0\n", - "Occupation_programmer 1.0\n", - "Region_Midwest 1.0" + " 947\n", + "UserId 948\n", + "Gender_M True\n", + "Age_56 True\n", + "Occupation_programmer True\n", + "Region_Midwest True" ] }, "execution_count": 9, @@ -889,7 +889,7 @@ } ], "source": [ - "user_side_info.loc[user_side_info.UserId == 948].T.where(lambda x: x > 0).dropna()" + "user_side_info.loc[user_side_info[\"UserId\"] == 948].T.where(lambda x: x > 0).dropna()" ] }, { @@ -1026,11 +1026,13 @@ } ], "source": [ - "ratings\\\n", - " .loc[ratings.UserId == 948]\\\n", - " .sort_values(\"Rating\", ascending=False)\\\n", - " 
.assign(Movie=lambda x: x.ItemId.map(movie_id_to_title))\\\n", - " .head(10)" + "(\n", + " ratings\n", + " .loc[lambda x: x[\"UserId\"] == 948]\n", + " .sort_values(\"Rating\", ascending=False)\n", + " .assign(Movie=lambda x: x[\"ItemId\"].map(movie_id_to_title))\n", + " .head(10)\n", + ")" ] }, { @@ -1179,11 +1181,13 @@ } ], "source": [ - "ratings\\\n", - " .loc[ratings.UserId == 948]\\\n", - " .sort_values(\"Rating\", ascending=True)\\\n", - " .assign(Movie=lambda x: x.ItemId.map(movie_id_to_title))\\\n", - " .head(10)" + "(\n", + " ratings\n", + " .loc[lambda x: x[\"UserId\"] == 948]\n", + " .sort_values(\"Rating\", ascending=True)\n", + " .assign(Movie=lambda x: x[\"ItemId\"].map(movie_id_to_title))\n", + " .head(10)\n", + ")" ] }, { @@ -1200,8 +1204,8 @@ "outputs": [], "source": [ "### Will exclude already-seen movies\n", - "exclude = ratings.ItemId.loc[ratings.UserId == 948]\n", - "exclude_cb = exclude.loc[exclude.isin(item_sideinfo_pca.ItemId)]\n", + "exclude = ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948]\n", + "exclude_cb = exclude.loc[lambda x: x.isin(item_sideinfo_pca[\"ItemId\"])]\n", "\n", "### Recommended lists with those excluded\n", "recommended_non_personalized = model_non_personalized.topN(user=948, n=10, exclude=exclude)\n", @@ -1259,16 +1263,16 @@ "10) - City Lights (1931) - Average Rating: 4.39 - Number of ratings: 271\n", "----------------\n", "Recommended from ratings-only model\n", - "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n", - "2) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", - "3) - Mummy, The (1932) - Average Rating: 3.54 - Number of ratings: 162\n", - "4) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", - "5) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", - "6) - City Lights (1931) - Average Rating: 4.39 - Number of ratings: 271\n", - "7) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average 
Rating: 3.99 - Number of ratings: 238\n", + "1) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n", + "2) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n", + "3) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", + "4) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", + "6) - Hurricane, The (1999) - Average Rating: 3.85 - Number of ratings: 509\n", + "7) - Contender, The (2000) - Average Rating: 3.78 - Number of ratings: 388\n", "8) - Wolf Man, The (1941) - Average Rating: 3.76 - Number of ratings: 134\n", - "9) - American History X (1998) - Average Rating: 4.23 - Number of ratings: 640\n", - "10) - Chariots of Fire (1981) - Average Rating: 3.8 - Number of ratings: 634\n", + "9) - Apostle, The (1997) - Average Rating: 3.73 - Number of ratings: 471\n", + "10) - Mummy, The (1932) - Average Rating: 3.54 - Number of ratings: 162\n", "----------------\n", "Recommended from attributes-only model\n", "1) - Shawshank Redemption, The (1994) - Average Rating: 4.55 - Number of ratings: 2227\n", @@ -1277,22 +1281,22 @@ "4) - Jean de Florette (1986) - Average Rating: 4.32 - Number of ratings: 216\n", "5) - It Happened One Night (1934) - Average Rating: 4.28 - Number of ratings: 374\n", "6) - Central Station (Central do Brasil) (1998) - Average Rating: 4.28 - Number of ratings: 215\n", - "7) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n", - "8) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n", - "9) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n", - "10) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n", + "7) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 
310\n", + "8) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n", + "9) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n", + "10) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n", "----------------\n", "Recommended from hybrid model\n", - "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n", - "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", "3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n", - "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", - "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", - "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", - "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", - "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", - "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n", - "10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n" + "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n", + "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", + "6) - Mr. 
Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n", + "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n", + "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", + "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", + "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n" ] } ], @@ -1386,15 +1390,15 @@ "output_type": "stream", "text": [ "1) - Plan 9 from Outer Space (1958) - Average Rating: 2.63 - Number of ratings: 249\n", - "2) - Anne Frank Remembered (1995) - Average Rating: 4.1 - Number of ratings: 41\n", - "3) - Next Friday (1999) - Average Rating: 2.6 - Number of ratings: 168\n", - "4) - Muppet Christmas Carol, The (1992) - Average Rating: 3.61 - Number of ratings: 262\n", - "5) - Snow Day (2000) - Average Rating: 2.21 - Number of ratings: 122\n", - "6) - Black Mask (Hak hap) (1996) - Average Rating: 3.08 - Number of ratings: 66\n", - "7) - Foreign Student (1994) - Average Rating: 3.0 - Number of ratings: 2\n", - "8) - Ballad of Narayama, The (Narayama Bushiko) (1982) - Average Rating: 3.95 - Number of ratings: 19\n", - "9) - Around the World in 80 Days (1956) - Average Rating: 3.6 - Number of ratings: 269\n", - "10) - Faust (1994) - Average Rating: 3.48 - Number of ratings: 31\n" + "2) - East-West (Est-ouest) (1999) - Average Rating: 3.77 - Number of ratings: 103\n", + "3) - Rugrats Movie, The (1998) - Average Rating: 2.78 - Number of ratings: 141\n", + "4) - Taste of Cherry (1997) - Average Rating: 3.53 - Number of ratings: 32\n", + "5) - Julien Donkey-Boy (1999) - Average Rating: 3.33 - Number of ratings: 12\n", + "6) - Original Kings of Comedy, The (2000) - Average Rating: 3.23 - Number of ratings: 147\n", + "7) - Maya Lin: A Strong Clear Vision (1994) - Average Rating: 4.1 - Number of ratings: 59\n", + "8) - Double Life of Veronique, The (La 
Double Vie de Véronique) (1991) - Average Rating: 3.94 - Number of ratings: 129\n", + "9) - Crash (1996) - Average Rating: 2.76 - Number of ratings: 141\n", + "10) - Faraway, So Close (In Weiter Ferne, So Nah!) (1993) - Average Rating: 3.71 - Number of ratings: 66\n" ] } ], @@ -1424,15 +1428,15 @@ "output_type": "stream", "text": [ "1) - Wrong Trousers, The (1993) - Average Rating: 4.51 - Number of ratings: 882\n", - "2) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n", - "3) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", - "4) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n", - "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", + "2) - Willy Wonka and the Chocolate Factory (1971) - Average Rating: 3.86 - Number of ratings: 1313\n", + "3) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n", + "4) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "5) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n", "6) - Close Shave, A (1995) - Average Rating: 4.52 - Number of ratings: 657\n", - "7) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", - "8) - Shadow of a Doubt (1943) - Average Rating: 4.27 - Number of ratings: 233\n", - "9) - Citizen Kane (1941) - Average Rating: 4.39 - Number of ratings: 1116\n", - "10) - Christmas Carol, A (1938) - Average Rating: 3.99 - Number of ratings: 194\n" + "7) - Grand Day Out, A (1992) - Average Rating: 4.36 - Number of ratings: 473\n", + "8) - Citizen Kane (1941) - Average Rating: 4.39 - Number of ratings: 1116\n", + "9) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", + "10) - Rebecca (1940) - Average Rating: 4.2 - Number of ratings: 386\n" ] } ], @@ 
-1466,22 +1470,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n", - "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", "3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n", - "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", - "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", - "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", - "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", - "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", - "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n", - "10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n" + "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n", + "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", + "6) - Mr. 
Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n", + "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n", + "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", + "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", + "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n" ] } ], "source": [ - "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings.ItemId.loc[ratings.UserId == 948],\n", - " X_val=ratings.Rating.loc[ratings.UserId == 948],\n", + "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948],\n", + " X_val=ratings[\"Rating\"].loc[ratings[\"UserId\"] == 948],\n", " exclude=exclude))" ] }, @@ -1494,23 +1498,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n", - "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n", + "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", "3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n", - "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n", - "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n", - "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", - "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", - "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", - "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n", - 
"10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n" + "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n", + "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n", + "6) - Mr. Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n", + "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n", + "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n", + "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n", + "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n" ] } ], "source": [ - "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings.ItemId.loc[ratings.UserId == 948],\n", - " X_val=ratings.Rating.loc[ratings.UserId == 948],\n", - " U=user_side_info.loc[user_side_info.UserId == 948],\n", + "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948],\n", + " X_val=ratings[\"Rating\"].loc[ratings[\"UserId\"] == 948],\n", + " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948],\n", " exclude=exclude))" ] }, @@ -1531,14 +1535,18 @@ "6) - Wallace & Gromit: The Best of Aardman Animation (1996) - Average Rating: 4.43 - Number of ratings: 438\n", "7) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n", "8) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n", - "9) - Life Is Beautiful (La Vita � bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n", + "9) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n", "10) - Grand Day Out, A (1992) - Average Rating: 4.36 - Number of ratings: 473\n" ] } ], "source": [ - "print_reclist(model_with_sideinfo.topN_cold(U=user_side_info.loc[user_side_info.UserId == 948].drop(\"UserId\", 
axis=1),\n", - " exclude=exclude))" + "print_reclist(\n", + " model_with_sideinfo.topN_cold(\n", + " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948].drop(\"UserId\", axis=1),\n", + " exclude=exclude\n", + " )\n", + ")" ] }, { @@ -1563,16 +1571,20 @@ "4) - Jean de Florette (1986) - Average Rating: 4.32 - Number of ratings: 216\n", "5) - It Happened One Night (1934) - Average Rating: 4.28 - Number of ratings: 374\n", "6) - Central Station (Central do Brasil) (1998) - Average Rating: 4.28 - Number of ratings: 215\n", - "7) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n", - "8) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n", - "9) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n", - "10) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n" + "7) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n", + "8) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n", + "9) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n", + "10) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n" ] } ], "source": [ - "print_reclist(model_content_based.topN_cold(U=user_side_info.loc[user_side_info.UserId == 948].drop(\"UserId\", axis=1),\n", - " exclude=exclude_cb))" + "print_reclist(\n", + " model_content_based.topN_cold(\n", + " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948].drop(\"UserId\", axis=1),\n", + " exclude=exclude_cb\n", + " )\n", + ")" ] }, { @@ -1627,40 +1639,40 @@ "text": [ "Number of ratings in training data: 512972\n", "Number of ratings in test data type (1): 128221\n", - "Number of ratings in test data type (2): 153128\n", - "Number of ratings in test data type (3): 138904\n", - "Number of ratings in test data type (4): 36450\n" + "Number of ratings in test data type (2): 154507\n", + "Number of 
ratings in test data type (3): 139009\n", + "Number of ratings in test data type (4): 36774\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", - "users_train, users_test = train_test_split(ratings.UserId.unique(), test_size=0.2, random_state=1)\n", - "items_train, items_test = train_test_split(ratings.ItemId.unique(), test_size=0.2, random_state=2)\n", + "users_train, users_test = train_test_split(ratings[\"UserId\"].unique(), test_size=0.2, random_state=1)\n", + "items_train, items_test = train_test_split(ratings[\"ItemId\"].unique(), test_size=0.2, random_state=2)\n", "\n", - "ratings_train, ratings_test1 = train_test_split(ratings.loc[ratings.UserId.isin(users_train) &\n", - " ratings.ItemId.isin(items_train)],\n", + "ratings_train, ratings_test1 = train_test_split(ratings.loc[ratings[\"UserId\"].isin(users_train) &\n", + " ratings[\"ItemId\"].isin(items_train)],\n", " test_size=0.2, random_state=123)\n", - "users_train = ratings_train.UserId.unique()\n", - "items_train = ratings_train.ItemId.unique()\n", - "ratings_test1 = ratings_test1.loc[ratings_test1.UserId.isin(users_train) &\n", - " ratings_test1.ItemId.isin(items_train)]\n", - "\n", - "user_attr_train = user_side_info.loc[user_side_info.UserId.isin(users_train)]\n", - "item_attr_train = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(items_train)]\n", - "\n", - "ratings_test2 = ratings.loc[ratings.UserId.isin(users_train) &\n", - " ~ratings.ItemId.isin(items_train) &\n", - " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n", - "ratings_test3 = ratings.loc[~ratings.UserId.isin(users_train) &\n", - " ratings.ItemId.isin(items_train) &\n", - " ratings.UserId.isin(user_side_info.UserId) &\n", - " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n", - "ratings_test4 = ratings.loc[~ratings.UserId.isin(users_train) &\n", - " ~ratings.ItemId.isin(items_train) &\n", - " ratings.UserId.isin(user_side_info.UserId) &\n", - " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n", + 
"users_train = ratings_train[\"UserId\"].unique()\n", + "items_train = ratings_train[\"ItemId\"].unique()\n", + "ratings_test1 = ratings_test1.loc[ratings_test1[\"UserId\"].isin(users_train) &\n", + " ratings_test1[\"ItemId\"].isin(items_train)]\n", + "\n", + "user_attr_train = user_side_info.loc[lambda x: x[\"UserId\"].isin(users_train)]\n", + "item_attr_train = item_sideinfo_pca.loc[lambda x: x[\"ItemId\"].isin(items_train)]\n", + "\n", + "ratings_test2 = ratings.loc[ratings[\"UserId\"].isin(users_train) &\n", + " ~ratings[\"ItemId\"].isin(items_train) &\n", + " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n", + "ratings_test3 = ratings.loc[~ratings[\"UserId\"].isin(users_train) &\n", + " ratings[\"ItemId\"].isin(items_train) &\n", + " ratings[\"UserId\"].isin(user_side_info[\"UserId\"]) &\n", + " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n", + "ratings_test4 = ratings.loc[~ratings[\"UserId\"].isin(users_train) &\n", + " ~ratings[\"ItemId\"].isin(items_train) &\n", + " ratings[\"UserId\"].isin(user_side_info[\"UserId\"]) &\n", + " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n", "\n", "\n", "print(\"Number of ratings in training data: %d\" % ratings_train.shape[0])\n", @@ -1701,8 +1713,8 @@ " U=user_attr_train,\n", " I=item_attr_train)\n", "m_contentbased = ContentBased(k=40, user_bias=False, item_bias=False)\\\n", - " .fit(X=ratings_train.loc[ratings_train.UserId.isin(user_attr_train.UserId) &\n", - " ratings_train.ItemId.isin(item_attr_train.ItemId)],\n", + " .fit(X=ratings_train.loc[ratings_train[\"UserId\"].isin(user_attr_train[\"UserId\"]) &\n", + " ratings_train[\"ItemId\"].isin(item_attr_train[\"ItemId\"])],\n", " U=user_attr_train,\n", " I=item_attr_train)\n", "m_mostpopular = MostPopular(user_bias=True)\\\n", @@ -1726,44 +1738,44 @@ "output_type": "stream", "text": [ "RMSE type 1 non-personalized model: 0.911 [rho: 0.580]\n", - "RMSE type 1 ratings-only model: 0.897 [rho: 0.603]\n", - "RMSE type 1 hybrid model: 
0.860 [rho: 0.641]\n", - "RMSE type 1 content-based model: 0.975 [rho: 0.486]\n" + "RMSE type 1 ratings-only model: 0.896 [rho: 0.603]\n", + "RMSE type 1 hybrid model: 0.861 [rho: 0.640]\n", + "RMSE type 1 content-based model: 0.975 [rho: 0.487]\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error\n", "\n", - "pred_contetbased = m_mostpopular.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_contetbased = m_mostpopular.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 non-personalized model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_contetbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_contetbased)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_contetbased)[0,1]))\n", "\n", - "pred_ratingsonly = m_classic.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_ratingsonly = m_classic.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 ratings-only model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_ratingsonly,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_ratingsonly)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_ratingsonly)[0,1]))\n", "\n", - "pred_hybrid = m_collective.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_hybrid = m_collective.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_hybrid)[0,1]))\n", 
"\n", - "test_cb = ratings_test1.loc[ratings_test1.UserId.isin(user_attr_train.UserId) &\n", - " ratings_test1.ItemId.isin(item_attr_train.ItemId)]\n", - "pred_contentbased = m_contentbased.predict(test_cb.UserId, test_cb.ItemId)\n", + "test_cb = ratings_test1.loc[ratings_test1[\"UserId\"].isin(user_attr_train[\"UserId\"]) &\n", + " ratings_test1[\"ItemId\"].isin(item_attr_train[\"ItemId\"])]\n", + "pred_contentbased = m_contentbased.predict(test_cb[\"UserId\"], test_cb[\"ItemId\"])\n", "print(\"RMSE type 1 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(test_cb.Rating,\n", + " (np.sqrt(mean_squared_error(test_cb[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(test_cb.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(test_cb[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -1782,27 +1794,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE type 2 hybrid model: 1.023 [rho: 0.424]\n", - "RMSE type 2 content-based model: 0.977 [rho: 0.484]\n" + "RMSE type 2 hybrid model: 1.025 [rho: 0.424]\n", + "RMSE type 2 content-based model: 0.977 [rho: 0.486]\n" ] } ], "source": [ - "pred_hybrid = m_collective.predict_new(ratings_test2.UserId,\n", - " item_attr_test.loc[ratings_test2.ItemId])\n", + "pred_hybrid = m_collective.predict_new(ratings_test2[\"UserId\"],\n", + " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n", "print(\"RMSE type 2 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test2.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test2[\"Rating\"], pred_hybrid)[0,1]))\n", "\n", - "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2.UserId],\n", - " item_attr_test.loc[ratings_test2.ItemId])\n", + "pred_contentbased = 
m_contentbased.predict_new(user_attr_test.loc[ratings_test2[\"UserId\"]],\n", + " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n", "print(\"RMSE type 2 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test2.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(ratings_test2[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -1827,21 +1839,21 @@ } ], "source": [ - "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3.ItemId,\n", - " U=user_attr_test.loc[ratings_test3.UserId])\n", + "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n", + " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n", "print(\"RMSE type 3 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test3.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test3[\"Rating\"], pred_hybrid)[0,1]))\n", "\n", - "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3.UserId],\n", - " item_attr_test.loc[ratings_test3.ItemId])\n", + "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3[\"UserId\"]],\n", + " item_attr_test.loc[ratings_test3[\"ItemId\"]])\n", "print(\"RMSE type 3 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test3.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(ratings_test3[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -1860,18 +1872,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE type 4 content-based model: 0.986 [rho: 
0.462]\n" + "RMSE type 4 content-based model: 0.986 [rho: 0.464]\n" ] } ], "source": [ - "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test4.UserId],\n", - " item_attr_test.loc[ratings_test4.ItemId])\n", + "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test4[\"UserId\"]],\n", + " item_attr_test.loc[ratings_test4[\"ItemId\"]])\n", "print(\"RMSE type 4 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test4.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test4[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test4.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(ratings_test4[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -1913,10 +1925,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE type 1 ratings-only model: 0.897 [rho: 0.603]\n", - "RMSE type 1 ratings + implicit + dyn + Chol: 0.853 [rho: 0.647]\n", - "RMSE type 1 hybrid model: 0.860 [rho: 0.641]\n", - "RMSE type 1 hybrid + implicit + dyn + Chol: 0.847 [rho: 0.653]\n" + "RMSE type 1 ratings-only model: 0.896 [rho: 0.603]\n", + "RMSE type 1 ratings + implicit + dyn + Chol: 0.853 [rho: 0.646]\n", + "RMSE type 1 hybrid model: 0.861 [rho: 0.640]\n", + "RMSE type 1 hybrid + implicit + dyn + Chol: 0.846 [rho: 0.654]\n" ] } ], @@ -1933,35 +1945,35 @@ " U=user_attr_train,\n", " I=item_attr_train)\n", "\n", - "pred_ratingsonly = m_classic.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_ratingsonly = m_classic.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 ratings-only model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_ratingsonly,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_ratingsonly)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_ratingsonly)[0,1]))\n", "\n", - 
"pred_implicit = m_implicit.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_implicit = m_implicit.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 ratings + implicit + dyn + Chol: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_implicit,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_implicit)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_implicit)[0,1]))\n", "\n", - "pred_hybrid = m_collective.predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + "pred_hybrid = m_collective.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test1[\"Rating\"], pred_hybrid)[0,1]))\n", "\n", "\n", "pred_implicit_plus_collective = m_implicit_plus_collective.\\\n", - " predict(ratings_test1.UserId, ratings_test1.ItemId)\n", + " predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n", "print(\"RMSE type 1 hybrid + implicit + dyn + Chol: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n", " pred_implicit_plus_collective,\n", " squared=True)),\n", - " np.corrcoef(ratings_test1.Rating, pred_implicit_plus_collective)[0,1]))" + " np.corrcoef(ratings_test1[\"Rating\"], pred_implicit_plus_collective)[0,1]))" ] }, { @@ -1980,38 +1992,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE type 2 hybrid model: 1.023 [rho: 0.424]\n", - "RMSE type 2 hybrid model + implicit + dyn + Chol: 0.999 [rho: 0.490] (might get worse)\n", - "RMSE type 2 content-based model: 0.977 [rho: 
0.484]\n" + "RMSE type 2 hybrid model: 1.025 [rho: 0.424]\n", + "RMSE type 2 hybrid model + implicit + dyn + Chol: 1.004 [rho: 0.480] (might get worse)\n", + "RMSE type 2 content-based model: 0.977 [rho: 0.486]\n" ] } ], "source": [ - "pred_hybrid = m_collective.predict_new(ratings_test2.UserId,\n", - " item_attr_test.loc[ratings_test2.ItemId])\n", + "pred_hybrid = m_collective.predict_new(ratings_test2[\"UserId\"],\n", + " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n", "print(\"RMSE type 2 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test2.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test2[\"Rating\"], pred_hybrid)[0,1]))\n", "\n", "pred_implicit_plus_collective = \\\n", " m_implicit_plus_collective\\\n", - " .predict_new(ratings_test2.UserId,\n", - " item_attr_test.loc[ratings_test2.ItemId])\n", + " .predict_new(ratings_test2[\"UserId\"],\n", + " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n", "print(\"RMSE type 2 hybrid model + implicit + dyn + Chol: %.3f [rho: %.3f] (might get worse)\" %\n", - " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n", " pred_implicit_plus_collective,\n", " squared=True)),\n", - " np.corrcoef(ratings_test2.Rating, pred_implicit_plus_collective)[0,1]))\n", + " np.corrcoef(ratings_test2[\"Rating\"], pred_implicit_plus_collective)[0,1]))\n", "\n", - "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2.UserId],\n", - " item_attr_test.loc[ratings_test2.ItemId])\n", + "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2[\"UserId\"]],\n", + " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n", "print(\"RMSE type 2 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n", + " 
(np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test2.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(ratings_test2[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -2024,38 +2036,38 @@ "output_type": "stream", "text": [ "RMSE type 3 hybrid model: 0.988 [rho: 0.470]\n", - "RMSE type 3 hybrid model + implicit + dyn + Chol: 1.014 [rho: 0.457] (got worse)\n", + "RMSE type 3 hybrid model + implicit + dyn + Chol: 1.013 [rho: 0.458] (got worse)\n", "RMSE type 3 content-based model: 0.981 [rho: 0.468]\n" ] } ], "source": [ - "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3.ItemId,\n", - " U=user_attr_test.loc[ratings_test3.UserId])\n", + "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n", + " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n", "print(\"RMSE type 3 hybrid model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n", " pred_hybrid,\n", " squared=True)),\n", - " np.corrcoef(ratings_test3.Rating, pred_hybrid)[0,1]))\n", + " np.corrcoef(ratings_test3[\"Rating\"], pred_hybrid)[0,1]))\n", "\n", "\n", "pred_implicit_plus_collective = \\\n", " m_implicit_plus_collective\\\n", - " .predict_cold_multiple(item=ratings_test3.ItemId,\n", - " U=user_attr_test.loc[ratings_test3.UserId])\n", + " .predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n", + " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n", "print(\"RMSE type 3 hybrid model + implicit + dyn + Chol: %.3f [rho: %.3f] (got worse)\" %\n", - " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n", " pred_implicit_plus_collective,\n", " squared=True)),\n", - " np.corrcoef(ratings_test3.Rating, pred_implicit_plus_collective)[0,1]))\n", + " np.corrcoef(ratings_test3[\"Rating\"], pred_implicit_plus_collective)[0,1]))\n", 
"\n", - "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3.UserId],\n", - " item_attr_test.loc[ratings_test3.ItemId])\n", + "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3[\"UserId\"]],\n", + " item_attr_test.loc[ratings_test3[\"ItemId\"]])\n", "print(\"RMSE type 3 content-based model: %.3f [rho: %.3f]\" %\n", - " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n", + " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n", " pred_contentbased,\n", " squared=True)),\n", - " np.corrcoef(ratings_test3.Rating, pred_contentbased)[0,1]))" + " np.corrcoef(ratings_test3[\"Rating\"], pred_contentbased)[0,1]))" ] }, { @@ -2075,7 +2087,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -2089,7 +2101,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/example/load_data.ipynb b/example/load_data.ipynb index 017b83a..4decacf 100644 --- a/example/load_data.ipynb +++ b/example/load_data.ipynb @@ -98,8 +98,11 @@ "source": [ "import numpy as np, pandas as pd, re\n", "\n", - "ratings = pd.read_table('ml-1m/ratings.dat', sep='::',\n", - " engine='python', names=['UserId','ItemId','Rating','Timestamp'])\n", + "ratings = pd.read_table(\n", + " 'ml-1m/ratings.dat',\n", + " sep='::', engine='python',\n", + " names=['UserId','ItemId','Rating','Timestamp']\n", + ")\n", "ratings = ratings.drop(\"Timestamp\", axis=1)\n", "ratings.head()" ] @@ -120,9 +123,9 @@ } ], "source": [ - "print(\"Number of users: %d\" % ratings.UserId.drop_duplicates().count())\n", - "print(\"Number of items: %d\" % ratings.ItemId.drop_duplicates().count())\n", - "print(\"Number of ratings: %d\" % ratings.Rating.count())" + "print(\"Number of users: %d\" % ratings[\"UserId\"].nunique())\n", + "print(\"Number of items: %d\" % 
ratings[\"ItemId\"].nunique())\n", + "print(\"Number of ratings: %d\" % ratings[\"Rating\"].count())" ] }, { @@ -207,9 +210,11 @@ } ], "source": [ - "movie_titles = pd.read_table('ml-1m/movies.dat',\n", - " sep='::', engine='python', header=None)\n", - "movie_titles.columns = ['ItemId', 'title', 'genres']\n", + "movie_titles = pd.read_table(\n", + " 'ml-1m/movies.dat',\n", + " sep='::', engine='python', header=None, encoding='latin_1',\n", + " names=['ItemId', 'title', 'genres']\n", + ")\n", "movie_titles = movie_titles[['ItemId', 'title']]\n", "\n", "movie_titles.head()" @@ -221,7 +226,7 @@ "metadata": {}, "outputs": [], "source": [ - "movie_id_to_title = {i.ItemId:i.title for i in movie_titles.itertuples()}" + "movie_id_to_title = {i.ItemId: i.title for i in movie_titles.itertuples()}" ] }, { @@ -444,7 +449,7 @@ "\n", "tags = pd.read_csv('ml-25m/genome-scores.csv')\n", "tags_wide = tags.pivot(index='movieId', columns='tagId', values='relevance')\n", - "tags_wide.columns=[\"tag\"+str(i) for i in tags_wide.columns.values]\n", + "tags_wide.columns=[\"tag\"+str(i) for i in tags_wide.columns]\n", "\n", "item_side_info = pd.merge(movies, tags_wide, how='inner', left_on='movieId', right_index=True)\n", "item_side_info = item_side_info.drop('movieId', axis=1)\n", @@ -485,7 +490,6 @@ " \n", " \n", " ItemId\n", - " pc0\n", " pc1\n", " pc2\n", " pc3\n", @@ -494,8 +498,8 @@ " pc6\n", " pc7\n", " pc8\n", + " pc9\n", " ...\n", - " pc40\n", " pc41\n", " pc42\n", " pc43\n", @@ -505,128 +509,129 @@ " pc47\n", " pc48\n", " pc49\n", + " pc50\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", - " 1.192433\n", - " 2.034965\n", - " 2.679781\n", - " 1.154823\n", - " 0.715302\n", - " 0.982528\n", - " 1.251208\n", - " -0.792800\n", - " 1.605826\n", + " 1.193171\n", + " 2.085621\n", + " 2.634135\n", + " 1.156088\n", + " 0.721649\n", + " 0.995436\n", + " 1.250474\n", + " -0.779532\n", + " 1.616702\n", " ...\n", - " -0.322325\n", - " -0.082968\n", - " -0.031470\n", - " -0.220287\n", - " 
0.207028\n", - " 0.044198\n", - " 0.273854\n", - " -0.209990\n", - " 0.035795\n", - " -0.159606\n", + " -0.317134\n", + " -0.070338\n", + " -0.019553\n", + " 0.169051\n", + " 0.201415\n", + " -0.094831\n", + " -0.250461\n", + " -0.149919\n", + " -0.031735\n", + " -0.177708\n", " \n", " \n", " 1\n", " 2\n", - " -1.333200\n", - " 1.719346\n", - " 1.383137\n", - " 0.788332\n", - " -0.487431\n", - " 0.376546\n", - " 0.803104\n", - " -0.606602\n", - " 0.914494\n", + " -1.333533\n", + " 1.743796\n", + " 1.352161\n", + " 0.795724\n", + " -0.484175\n", + " 0.380645\n", + " 0.804462\n", + " -0.598527\n", + " 0.917250\n", " ...\n", - " 0.278489\n", - " -0.293607\n", - " 0.028680\n", - " -0.030128\n", - " 0.311445\n", - " 0.353925\n", - " -0.318455\n", - " 0.098478\n", - " -0.078716\n", - " 0.049872\n", + " 0.300060\n", + " -0.261956\n", + " 0.054457\n", + " 0.003863\n", + " 0.304605\n", + " -0.315796\n", + " 0.360203\n", + " 0.152770\n", + " 0.144790\n", + " -0.096549\n", " \n", " \n", " 2\n", " 3\n", - " -1.363421\n", - " -0.034093\n", - " 0.528633\n", - " -0.312122\n", - " 0.468820\n", - " 0.164593\n", - " 0.021909\n", - " 0.161554\n", - " -0.231992\n", + " -1.363395\n", + " -0.017107\n", + " 0.530395\n", + " -0.316202\n", + " 0.469430\n", + " 0.164630\n", + " 0.019083\n", + " 0.159188\n", + " -0.232969\n", " ...\n", - " 0.217242\n", - " -0.103778\n", - " -0.290084\n", - " -0.033624\n", - " 0.076278\n", - " 0.224247\n", - " 0.159550\n", - " -0.091091\n", - " -0.134674\n", - " -0.193942\n", + " 0.215020\n", + " -0.060682\n", + " -0.280852\n", + " 0.001087\n", + " 0.084960\n", + " -0.257190\n", + " -0.136963\n", + " -0.113914\n", + " 0.128352\n", + " -0.203658\n", " \n", " \n", " 3\n", " 4\n", - " -1.238094\n", - " -1.014399\n", - " 0.790394\n", - " -0.296004\n", - " -0.095043\n", - " -0.052266\n", - " -0.180244\n", - " -0.768811\n", - " -0.400559\n", + " -1.237840\n", + " -0.993731\n", + " 0.809815\n", + " -0.303009\n", + " -0.088991\n", + " -0.049621\n", + " -0.179544\n", 
+ " -0.771278\n", + " -0.400499\n", " ...\n", - " 0.073494\n", - " 0.037196\n", - " -0.225767\n", - " -0.398071\n", - " 0.275756\n", - " -0.335302\n", - " 0.254760\n", - " -0.136116\n", - " -0.462383\n", - " 0.485561\n", + " 0.066207\n", + " 0.056054\n", + " -0.223027\n", + " 0.400157\n", + " 0.292300\n", + " 0.260936\n", + " -0.307608\n", + " -0.224141\n", + " 0.488955\n", + " 0.439189\n", " \n", " \n", " 4\n", " 5\n", - " -1.613220\n", - " -0.280142\n", - " 1.119149\n", - " -0.130238\n", - " 0.397091\n", - " 0.187158\n", - " 0.108864\n", - " -0.273748\n", - " -0.260166\n", + " -1.611499\n", + " -0.251899\n", + " 1.126443\n", + " -0.135702\n", + " 0.403340\n", + " 0.187289\n", + " 0.108451\n", + " -0.275341\n", + " -0.261142\n", " ...\n", - " 0.113957\n", - " -0.123240\n", - " -0.243951\n", - " -0.489377\n", - " -0.024730\n", - " 0.095848\n", - " 0.227061\n", - " -0.296050\n", - " -0.030863\n", - " -0.072919\n", + " 0.109560\n", + " -0.086042\n", + " -0.236327\n", + " 0.461589\n", + " 0.013350\n", + " -0.192557\n", + " -0.234025\n", + " -0.369643\n", + " -0.041060\n", + " -0.074656\n", " \n", " \n", "\n", @@ -634,26 +639,26 @@ "" ], "text/plain": [ - " ItemId pc0 pc1 pc2 pc3 pc4 pc5 \\\n", - "0 1 1.192433 2.034965 2.679781 1.154823 0.715302 0.982528 \n", - "1 2 -1.333200 1.719346 1.383137 0.788332 -0.487431 0.376546 \n", - "2 3 -1.363421 -0.034093 0.528633 -0.312122 0.468820 0.164593 \n", - "3 4 -1.238094 -1.014399 0.790394 -0.296004 -0.095043 -0.052266 \n", - "4 5 -1.613220 -0.280142 1.119149 -0.130238 0.397091 0.187158 \n", + " ItemId pc1 pc2 pc3 pc4 pc5 pc6 \\\n", + "0 1 1.193171 2.085621 2.634135 1.156088 0.721649 0.995436 \n", + "1 2 -1.333533 1.743796 1.352161 0.795724 -0.484175 0.380645 \n", + "2 3 -1.363395 -0.017107 0.530395 -0.316202 0.469430 0.164630 \n", + "3 4 -1.237840 -0.993731 0.809815 -0.303009 -0.088991 -0.049621 \n", + "4 5 -1.611499 -0.251899 1.126443 -0.135702 0.403340 0.187289 \n", "\n", - " pc6 pc7 pc8 ... 
pc40 pc41 pc42 pc43 \\\n", - "0 1.251208 -0.792800 1.605826 ... -0.322325 -0.082968 -0.031470 -0.220287 \n", - "1 0.803104 -0.606602 0.914494 ... 0.278489 -0.293607 0.028680 -0.030128 \n", - "2 0.021909 0.161554 -0.231992 ... 0.217242 -0.103778 -0.290084 -0.033624 \n", - "3 -0.180244 -0.768811 -0.400559 ... 0.073494 0.037196 -0.225767 -0.398071 \n", - "4 0.108864 -0.273748 -0.260166 ... 0.113957 -0.123240 -0.243951 -0.489377 \n", + " pc7 pc8 pc9 ... pc41 pc42 pc43 pc44 \\\n", + "0 1.250474 -0.779532 1.616702 ... -0.317134 -0.070338 -0.019553 0.169051 \n", + "1 0.804462 -0.598527 0.917250 ... 0.300060 -0.261956 0.054457 0.003863 \n", + "2 0.019083 0.159188 -0.232969 ... 0.215020 -0.060682 -0.280852 0.001087 \n", + "3 -0.179544 -0.771278 -0.400499 ... 0.066207 0.056054 -0.223027 0.400157 \n", + "4 0.108451 -0.275341 -0.261142 ... 0.109560 -0.086042 -0.236327 0.461589 \n", "\n", - " pc44 pc45 pc46 pc47 pc48 pc49 \n", - "0 0.207028 0.044198 0.273854 -0.209990 0.035795 -0.159606 \n", - "1 0.311445 0.353925 -0.318455 0.098478 -0.078716 0.049872 \n", - "2 0.076278 0.224247 0.159550 -0.091091 -0.134674 -0.193942 \n", - "3 0.275756 -0.335302 0.254760 -0.136116 -0.462383 0.485561 \n", - "4 -0.024730 0.095848 0.227061 -0.296050 -0.030863 -0.072919 \n", + " pc45 pc46 pc47 pc48 pc49 pc50 \n", + "0 0.201415 -0.094831 -0.250461 -0.149919 -0.031735 -0.177708 \n", + "1 0.304605 -0.315796 0.360203 0.152770 0.144790 -0.096549 \n", + "2 0.084960 -0.257190 -0.136963 -0.113914 0.128352 -0.203658 \n", + "3 0.292300 0.260936 -0.307608 -0.224141 0.488955 0.439189 \n", + "4 0.013350 -0.192557 -0.234025 -0.369643 -0.041060 -0.074656 \n", "\n", "[5 rows x 51 columns]" ] @@ -670,10 +675,12 @@ "item_sideinfo_reduced = item_side_info.drop(\"ItemId\", axis=1)\n", "item_sideinfo_pca = pca_obj.fit_transform(item_sideinfo_reduced)\n", "\n", - "item_sideinfo_pca = pd.DataFrame(item_sideinfo_pca)\n", - "item_sideinfo_pca.columns = [\"pc\"+str(i) for i in range(item_sideinfo_pca.shape[1])]\n", - 
"item_sideinfo_pca['ItemId'] = item_side_info.ItemId.values.copy()\n", - "item_sideinfo_pca = item_sideinfo_pca[[\"ItemId\"] + [cl for cl in item_sideinfo_pca.columns if cl != \"ItemId\"]]\n", + "item_sideinfo_pca = pd.DataFrame(\n", + " item_sideinfo_pca,\n", + " columns=[\"pc\"+str(i+1) for i in range(item_sideinfo_pca.shape[1])]\n", + ")\n", + "item_sideinfo_pca['ItemId'] = item_side_info[\"ItemId\"].to_numpy()\n", + "item_sideinfo_pca = item_sideinfo_pca[[\"ItemId\"] + item_sideinfo_pca.columns[:50].tolist()]\n", "item_sideinfo_pca.head()" ] }, @@ -686,13 +693,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of items from MovieLens 1M with side info: 3065\n" + "Number of items from MovieLens 1M with side info: 3080\n" ] } ], "source": [ "print(\"Number of items from MovieLens 1M with side info: %d\" %\n", - " ratings.ItemId[np.in1d(ratings.ItemId, item_sideinfo_pca.ItemId)].drop_duplicates().count())" + " ratings[\"ItemId\"][np.in1d(ratings[\"ItemId\"], item_sideinfo_pca[\"ItemId\"])].nunique())" ] }, { @@ -709,7 +716,7 @@ "outputs": [], "source": [ "zipcode_abbs = pd.read_csv(\"states.csv\", low_memory=False)\n", - "zipcode_abbs_dct = {z.State:z.Abbreviation for z in zipcode_abbs.itertuples()}\n", + "zipcode_abbs_dct = {z.State: z.Abbreviation for z in zipcode_abbs.itertuples()}\n", "us_regs_table = [\n", " ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),\n", " ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),\n", @@ -809,12 +816,14 @@ "source": [ "zipcode_info = pd.read_csv(\"free-zipcode-database.csv\", low_memory=False)\n", "zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()\n", - "zipcode_info.loc[zipcode_info.Country != \"US\", 'State'] = 'UnknownOrNonUS'\n", + "zipcode_info.loc[lambda x: x[\"Country\"] != \"US\", 'State'] = 'UnknownOrNonUS'\n", "zipcode_info['Region'] = zipcode_info['State'].copy()\n", - "zipcode_info.loc[zipcode_info.Country == 
\"US\", 'Region'] = \\\n", - " zipcode_info.Region\\\n", - " .loc[zipcode_info.Country == \"US\"]\\\n", - " .map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')\n", + "zipcode_info.loc[lambda x: x[\"Country\"] == \"US\", \"Region\"] = (\n", + " zipcode_info\n", + " .loc[lambda x: x[\"Country\"] == \"US\"]\n", + " [\"Region\"]\n", + " .map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')\n", + ")\n", "zipcode_info = zipcode_info[['Zipcode', 'Region']]\n", "zipcode_info.head()" ] @@ -925,11 +934,14 @@ } ], "source": [ - "users = pd.read_table('ml-1m/users.dat',\n", - " sep='::', names=[\"UserId\", \"Gender\", \"Age\", \"Occupation\", \"Zipcode\"], engine='python')\n", - "users[\"Zipcode\"] = users.Zipcode.map(lambda x: np.int(re.sub(\"-.*\",\"\",x)))\n", - "users = pd.merge(users,zipcode_info,on='Zipcode',how='left')\n", - "users['Region'] = users.Region.fillna('UnknownOrNonUS')\n", + "users = pd.read_table(\n", + " 'ml-1m/users.dat',\n", + " sep='::', engine='python', encoding='cp1252',\n", + " names=[\"UserId\", \"Gender\", \"Age\", \"Occupation\", \"Zipcode\"]\n", + ")\n", + "users[\"Zipcode\"] = users[\"Zipcode\"].map(lambda x: int(re.sub(\"-.*\", \"\", x)))\n", + "users = pd.merge(users, zipcode_info, on='Zipcode', how='left')\n", + "users['Region'] = users[\"Region\"].fillna('UnknownOrNonUS')\n", "\n", "occupations = {\n", " 0: \"\\\"other\\\" or not specified\",\n", @@ -954,8 +966,8 @@ " 19: \"unemployed\",\n", " 20: \"writer\"\n", "}\n", - "users['Occupation'] = users.Occupation.map(occupations)\n", - "users['Age'] = users.Age.map(lambda x: str(x))\n", + "users['Occupation'] = users[\"Occupation\"].map(occupations)\n", + "users['Age'] = users[\"Age\"].map(lambda x: str(x))\n", "users.head()" ] }, @@ -1012,122 +1024,122 @@ " \n", " 0\n", " 1\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " True\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " 
False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 1\n", " 2\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 2\n", " 3\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 3\n", " 4\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " False\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 4\n", " 5\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", " 
...\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " False\n", + " True\n", + " False\n", + " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", "\n", @@ -1136,32 +1148,32 @@ ], "text/plain": [ " UserId Gender_F Gender_M Age_1 Age_18 Age_25 Age_35 Age_45 Age_50 \\\n", - "0 1 1 0 1 0 0 0 0 0 \n", - "1 2 0 1 0 0 0 0 0 0 \n", - "2 3 0 1 0 0 1 0 0 0 \n", - "3 4 0 1 0 0 0 0 1 0 \n", - "4 5 0 1 0 0 1 0 0 0 \n", + "0 1 True False True False False False False False \n", + "1 2 False True False False False False False False \n", + "2 3 False True False False True False False False \n", + "3 4 False True False False False False True False \n", + "4 5 False True False False True False False False \n", "\n", " Age_56 ... Occupation_unemployed Occupation_writer \\\n", - "0 0 ... 0 0 \n", - "1 1 ... 0 0 \n", - "2 0 ... 0 0 \n", - "3 0 ... 0 0 \n", - "4 0 ... 0 1 \n", + "0 False ... False False \n", + "1 True ... False False \n", + "2 False ... False False \n", + "3 False ... False False \n", + "4 False ... 
False True \n", "\n", " Region_Middle Atlantic Region_Midwest Region_New England Region_South \\\n", - "0 0 1 0 0 \n", - "1 0 0 0 1 \n", - "2 0 1 0 0 \n", - "3 0 0 1 0 \n", - "4 0 1 0 0 \n", + "0 False True False False \n", + "1 False False False True \n", + "2 False True False False \n", + "3 False False True False \n", + "4 False True False False \n", "\n", " Region_Southwest Region_UnknownOrNonUS Region_UsOther Region_West \n", - "0 0 0 0 0 \n", - "1 0 0 0 0 \n", - "2 0 0 0 0 \n", - "3 0 0 0 0 \n", - "4 0 0 0 0 \n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", "\n", "[5 rows x 39 columns]" ] @@ -1191,7 +1203,7 @@ ], "source": [ "print(\"Number of users with demographic information: %d\" %\n", - " user_side_info.UserId.drop_duplicates().count())" + " user_side_info[\"UserId\"].nunique())" ] }, { @@ -1232,7 +1244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index 9295cf0..4206eea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ cython -numpy>=1.18.1 +numpy>=1.25 scipy -pandas>=0.25.0 +pandas threadpoolctl diff --git a/setup.py b/setup.py index 3c0d46e..9b1981c 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ def set_omp_false(): if len(custom_blas_link_args) or len(custom_blas_compile_args): build_ext_with_blas = build_ext +use_findblas = False if not (len(custom_blas_link_args) or len(custom_blas_compile_args)): use_findblas = (("findblas" in sys.argv) or ("-findblas" in sys.argv) @@ -45,6 +46,18 @@ def set_omp_false(): except AttributeError: EXIT_SUCCESS = 0 +## For debugging +if "--asan" in sys.argv: + ADD_ASAN = True + sys.argv.remove("--asan") +else: + ADD_ASAN = False +if "--ggdb" in sys.argv: + ADD_GGDB = True + sys.argv.remove("--ggdb") +else: + ADD_GGDB = 
False + class build_ext_subclass( build_ext_with_blas ): def build_extensions(self): is_windows = sys.platform[:3].lower() == "win" @@ -69,16 +82,25 @@ def build_extensions(self): if is_windows: e.define_macros += [("NO_LONG_DOUBLE", None)] - - # e.extra_compile_args += ['-O2', '-fopenmp', '-march=native', '-std=c99', '-ggdb'] - # e.extra_link_args += ['-fopenmp'] - # e.extra_link_args += ['-fopenmp=libomp'] - # e.extra_compile_args += ['-O2', '-march=native', '-std=c99', '-ggdb'] - + if ADD_ASAN: + for e in self.extensions: + if self.compiler.compiler_type != "clang": + e.extra_compile_args += ["-fsanitize=address", "-static-libasan", "-ggdb"] + else: + e.extra_compile_args += ["-fsanitize=address", "-static-libsan", "-ggdb"] + + elif ADD_GGDB: + for e in self.extensions: + e.extra_compile_args += ["-ggdb"] + e.define_macros += [("DEBUG", 1)] - # e.extra_compile_args += ['-fsanitize=address', '-static-libasan', '-ggdb'] - # e.extra_link_args += ['-fsanitize=address', '-static-libasan'] + if self.compiler.compiler_type == "clang": + e.extra_compile_args += [ + "-Wno-unknown-pragmas", + "-Wno-unknown-attributes", + "-Wno-pass-failed", + ] ## If a custom BLAS/LAPACK is provided: if len(custom_blas_link_args) or len(custom_blas_compile_args): @@ -348,7 +370,7 @@ def test_supports_clang_reassociate(self): setup( name = "cmfrec", packages = ["cmfrec"], - version = '3.5.1-7', + version = '3.5.1-8', description = 'Collective matrix factorization', author = 'David Cortes', url = 'https://github.com/david-cortes/cmfrec', @@ -356,9 +378,9 @@ def test_supports_clang_reassociate(self): 'relational learning'], install_requires=[ 'cython', - 'numpy>=1.17', + 'numpy>=1.25', 'scipy', - 'pandas>=0.25.0', + 'pandas', 'findblas' ], cmdclass = {'build_ext': build_ext_subclass},