diff --git a/.gitignore b/.gitignore
index 778fecb..6bca713 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,10 @@ test_math/*.c
test_math/failing.py
test_math/run
.Rproj.user
+
+example/*.p
+example/*.csv
+example/ml*
+example/ml-1m/*
+example/ml-25m/*
+example/cmfrec/*
diff --git a/cmfrec/__init__.py b/cmfrec/__init__.py
index fdc7191..e70a18b 100644
--- a/cmfrec/__init__.py
+++ b/cmfrec/__init__.py
@@ -1,6 +1,6 @@
from . import wrapper_double, wrapper_float
import numpy as np, pandas as pd
-from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, issparse, isspmatrix_coo, isspmatrix_csr, isspmatrix_csc
+from scipy.sparse import csr_array, csc_array, issparse
import multiprocessing
import ctypes
import warnings
@@ -10,6 +10,13 @@
"MostPopular", "ContentBased",
"CMF_imputer"]
+def _is_csr(x):
+ return issparse(x) and (x.format == "csr")
+
+def _is_csc(x):
+ return issparse(x) and (x.format == "csc")
+
+def _is_coo(x):
+ return issparse(x) and (x.format == "coo")
+
### TODO: this module should move from doing operations in Python to
### using the new designated C functions for each type of prediction.
@@ -67,7 +74,7 @@ def _take_params(self, implicit=False, alpha=40., downweight=False,
maxiter=400, niter=10, parallelize="separate", corr_pairs=4,
NA_as_zero=False, NA_as_zero_user=False, NA_as_zero_item=False,
precompute_for_predictions=True, use_float=False,
- random_state=1, verbose=True,
+ random_state=1, verbose=False,
print_every=10, handle_interrupt=True,
produce_dicts=False, nthreads=-1, n_jobs=None):
assert method in ["als", "lbfgs"]
@@ -88,21 +95,18 @@ def _take_params(self, implicit=False, alpha=40., downweight=False,
if ((max(k_user, k_item) + k + k_main + max(user_bias, item_bias))**2) > np.iinfo(ctypes.c_int).max:
raise ValueError("Number of factors is too large.")
+ dtype = ctypes.c_float if use_float else ctypes.c_double
lambda_ = float(lambda_) if isinstance(lambda_, int) else lambda_
- if isinstance(lambda_, (list, tuple, pd.Series)):
- lambda_ = np.array(lambda_)
- if isinstance(lambda_, np.ndarray):
- lambda_ = lambda_.reshape(-1)
+ if not isinstance(lambda_, float):
+ lambda_ = np.require(lambda_, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
assert lambda_.shape[0] == 6
assert np.all(lambda_ >= 0.)
else:
assert isinstance(lambda_, float) and lambda_ >= 0.
l1_lambda = float(l1_lambda) if isinstance(l1_lambda, int) else l1_lambda
- if isinstance(l1_lambda, (list, tuple, pd.Series)):
- l1_lambda = np.array(l1_lambda)
- if isinstance(l1_lambda, np.ndarray):
- l1_lambda = l1_lambda.reshape(-1)
+ if not isinstance(l1_lambda, float):
+ l1_lambda = np.require(l1_lambda, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
assert l1_lambda.shape[0] == 6
assert np.all(l1_lambda >= 0.)
else:
@@ -122,6 +126,8 @@ def _take_params(self, implicit=False, alpha=40., downweight=False,
nthreads = multiprocessing.cpu_count() + 1 + nthreads
if nthreads is None:
nthreads = 1
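+ # Tolerate float values for 'nthreads' (e.g. 4.0) by truncating to int.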
+ if isinstance(nthreads, float):
+ nthreads = int(nthreads)
assert isinstance(nthreads, int) and nthreads > 0
if (nthreads > 1) and (not wrapper_double._get_has_openmp()):
@@ -247,11 +253,9 @@ def _take_params(self, implicit=False, alpha=40., downweight=False,
self._k_main_col = self.k_main
if isinstance(self.lambda_, np.ndarray):
- if self.lambda_.dtype != self.dtype_:
- self.lambda_ = self.lambda_.astype(self.dtype_)
+ self.lambda_ = np.require(self.lambda_, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if isinstance(self.l1_lambda, np.ndarray):
- if self.l1_lambda.dtype != self.dtype_:
- self.l1_lambda = self.l1_lambda.astype(self.dtype_)
+ self.l1_lambda = np.require(self.l1_lambda, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
self._reset()
@@ -337,26 +341,18 @@ def _take_params_offsets(self, k_sec=0, k_main=0, add_intercepts=True):
def _append_NAs(self, U, m_u, p, append_U):
U_new = np.repeat(np.nan, m_u*p).reshape((m_u, p))
- if U_new.dtype != self.dtype_:
- U_new = U_new.astype(U.dtype)
- if not U_new.flags["C_CONTIGUOUS"]:
- U_new = np.ascontiguousarray(U_new)
+ U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
U_new[np.setdiff1d(np.arange(m_u), append_U), :] = U
if U_new.dtype != self.dtype_:
- U_new = U_new.astype(U.dtype)
+ U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
return U_new
def _decompose_coo(self, X):
- row = X.row
- col = X.col
- val = X.data
- if row.dtype != ctypes.c_int:
- row = row.astype(ctypes.c_int)
- if col.dtype != ctypes.c_int:
- col = col.astype(ctypes.c_int)
- if val.dtype != self.dtype_:
- val = val.astype(self.dtype_)
- return row, col, val
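+ # Return the COO triplets as C-contiguous arrays: C 'int' indices plus
+ # values in the model's working dtype, as the C backend expects.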
+ return (
+ np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
+ np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
+ np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
+ )
def _process_U_arr(self, U):
Urow = np.empty(0, dtype=ctypes.c_int)
@@ -366,20 +362,16 @@ def _process_U_arr(self, U):
Ucols = np.empty(0, dtype=object)
m = 0
p = 0
- if issparse(U) and not isspmatrix_coo(U):
+ if issparse(U) and not (U.format == "coo"):
U = U.tocoo()
- if isspmatrix_coo(U):
+ if _is_coo(U):
Urow, Ucol, Uval = self._decompose_coo(U)
m, p = U.shape
elif U is not None:
if isinstance(U, pd.DataFrame):
- Ucols = U.columns.to_numpy()
- U = U.to_numpy()
- if not U.flags["C_CONTIGUOUS"]:
- U = np.ascontiguousarray(U)
- if U.dtype != self.dtype_:
- U = U.astype(self.dtype_)
- Uarr = U
+ Ucols = U.columns.to_numpy(copy=True)
+ U = U.to_numpy(copy=False, dtype=self.dtype_)
+ Uarr = np.require(U, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
m, p = Uarr.shape
return Urow, Ucol, Uval, Uarr, Ucols, m, p
@@ -391,18 +383,23 @@ def _convert_ids(self, X, U, U_bin, col="UserId"):
append_Ub = np.empty(0, dtype=object)
msg = "'X' and side info have no IDs in common."
if (U is not None) and (U_bin is not None):
- user_ids1 = np.intersect1d(U[col].to_numpy(), X[col].to_numpy())
- user_ids2 = np.intersect1d(U_bin[col].to_numpy(), X[col].to_numpy())
- user_ids3 = np.intersect1d(U_bin[col].to_numpy(), U[col].to_numpy())
+ Xcol = X[col].to_numpy(copy=False)
+ Ucol = U[col].to_numpy(copy=False)
+ Ubcol = U_bin[col].to_numpy(copy=False)
+
+ user_ids1 = np.intersect1d(Ucol, Xcol)
+ user_ids2 = np.intersect1d(Ubcol, Xcol)
+ user_ids3 = np.intersect1d(Ubcol, Ucol)
if (user_ids1.shape[0] == 0) and (user_ids2.shape[0] == 0):
raise ValueError(msg)
user_ids = np.intersect1d(user_ids1, user_ids2)
- u_not_x = np.setdiff1d(U[col].to_numpy(), X[col].to_numpy())
- x_not_u = np.setdiff1d(X[col].to_numpy(), U[col].to_numpy())
- b_not_x = np.setdiff1d(U_bin[col].to_numpy(), X[col].to_numpy())
- x_not_b = np.setdiff1d(X[col].to_numpy(), U_bin[col].to_numpy())
- b_not_u = np.setdiff1d(U_bin[col].to_numpy(), U[col].to_numpy())
- u_not_b = np.setdiff1d(U[col].to_numpy(), U_bin[col].to_numpy())
+ u_not_x = np.setdiff1d(Ucol, Xcol)
+ x_not_u = np.setdiff1d(Xcol, Ucol)
+ b_not_x = np.setdiff1d(Ubcol, Xcol)
+ x_not_b = np.setdiff1d(Xcol, Ubcol)
+ b_not_u = np.setdiff1d(Ubcol, Ucol)
+ u_not_b = np.setdiff1d(Ucol, Ubcol)
### There can be cases in which the sets are disjoint,
### and will need to add NAs to one of the inputs.
@@ -415,32 +412,44 @@ def _convert_ids(self, X, U, U_bin, col="UserId"):
user_ids = user_ids
else:
if u_not_b.shape[0] >= b_not_u.shape[0]:
- user_ids = np.r_[user_ids, user_ids1, X[col].to_numpy(), user_ids3, U[col].to_numpy(), U_bin[col].to_numpy()]
+ user_ids = np.r_[user_ids, user_ids1, Xcol, user_ids3, Ucol, Ubcol]
append_U = x_not_u
append_Ub = np.r_[x_not_b, u_not_b]
else:
- user_ids = np.r_[user_ids, user_ids2, X[col].to_numpy(), user_ids3, U_bin[col].to_numpy(), U[col].to_numpy()]
+ user_ids = np.r_[user_ids, user_ids2, Xcol, user_ids3, Ubcol, Ucol]
append_U = np.r_[x_not_u, b_not_u]
append_Ub = x_not_b
+ # TODO: move away from pandas for these operations
_, user_mapping_ = pd.factorize(user_ids)
- X = X.assign(**{col : pd.Categorical(X[col], user_mapping_).codes})
- if X[col].dtype != ctypes.c_int:
- X = X.assign(**{col : X[col].astype(ctypes.c_int)})
- U = U.assign(**{col : pd.Categorical(U[col], user_mapping_).codes})
- if U[col].dtype != ctypes.c_int:
- U = U.assign(**{col : U[col].astype(ctypes.c_int)})
- U_bin = U_bin.assign(**{col : pd.Categorical(U_bin[col], user_mapping_).codes})
- if U_bin[col].dtype != ctypes.c_int:
- U_bin = U_bin.assign(**{col : U_bin[col].astype(ctypes.c_int)})
+ X = X.assign(**{
+ col : pd.Categorical(Xcol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
+ })
+ U = U.assign(**{
+ col : pd.Categorical(Ucol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
+ })
+ U_bin = U_bin.assign(**{
+ col : pd.Categorical(Ubcol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
+ })
+ user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1)
if append_U.shape[0]:
- append_U = pd.Categorical(np.unique(append_U), user_mapping_).codes.astype(ctypes.c_int)
+ append_U = pd.Categorical(
+ np.unique(append_U),
+ user_mapping_,
+ copy=False,
+ ).codes
append_U = np.sort(append_U)
+ append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if append_Ub.shape[0]:
- append_Ub = pd.Categorical(np.unique(append_Ub), user_mapping_).codes.astype(ctypes.c_int)
+ append_Ub = pd.Categorical(
+ np.unique(append_Ub),
+ user_mapping_,
+ copy=False,
+ ).codes
append_Ub = np.sort(append_Ub)
+ append_Ub = np.require(append_Ub, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if (U is None) and (U_bin is not None):
@@ -448,12 +457,15 @@ def _convert_ids(self, X, U, U_bin, col="UserId"):
swapped = True
if (U is not None):
- user_ids = np.intersect1d(U[col].to_numpy(), X[col].to_numpy())
+ Xcol = X[col].to_numpy(copy=False)
+ Ucol = U[col].to_numpy(copy=False)
+
+ user_ids = np.intersect1d(Ucol, Xcol)
if user_ids.shape[0] == 0:
raise ValueError(msg)
- u_not_x = np.setdiff1d(U[col].to_numpy(), X[col].to_numpy())
- x_not_u = np.setdiff1d(X[col].to_numpy(), U[col].to_numpy())
+ u_not_x = np.setdiff1d(Ucol, Xcol)
+ x_not_u = np.setdiff1d(Xcol, Ucol)
if (u_not_x.shape[0]) or (x_not_u.shape[0]):
### Case0: both have the same entries
### This is the ideal situation
@@ -469,29 +481,40 @@ def _convert_ids(self, X, U, U_bin, col="UserId"):
user_ids = np.r_[user_ids, u_not_x]
### Case3: both have IDs that the others don't
else:
- user_ids = np.r_[user_ids, X[col].to_numpy(), U[col].to_numpy()]
+ user_ids = np.r_[user_ids, Xcol, Ucol]
append_U = x_not_u
_, user_mapping_ = pd.factorize(user_ids)
- if not isinstance(user_mapping_, np.ndarray):
- user_mapping_ = user_mapping_.to_numpy()
- X = X.assign(**{col : pd.Categorical(X[col], user_mapping_).codes})
- if X[col].dtype != ctypes.c_int:
- X = X.assign(**{col : X[col].astype(ctypes.c_int)})
- U = U.assign(**{col : pd.Categorical(U[col], user_mapping_).codes})
- if U[col].dtype != ctypes.c_int:
- U = U.assign(**{col : U[col].astype(ctypes.c_int)})
+ X = X.assign(**{
+ col : pd.Categorical(
+ Xcol, user_mapping_, copy=False
+ )
+ .codes
+ .astype(dtype=ctypes.c_int)
+ })
+ U = U.assign(**{
+ col : pd.Categorical(
+ Ucol, user_mapping_, copy=False
+ )
+ .codes
+ .astype(dtype=ctypes.c_int)
+ })
if append_U.shape[0]:
- append_U = pd.Categorical(append_U, user_mapping_).codes.astype(ctypes.c_int)
+ append_U = pd.Categorical(
+ append_U, user_mapping_, copy=False
+ ).codes
append_U = np.sort(append_U)
+ append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1)
else:
- X_col, user_mapping_ = pd.factorize(X[col].to_numpy())
- X = X.assign(**{col : X_col})
+ Xcol = X[col].to_numpy(copy=False)
+ Xcol, user_mapping_ = pd.factorize(Xcol)
+ Xcol = np.require(Xcol, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ X = X.assign(**{col : Xcol})
if X[col].dtype != ctypes.c_int:
X = X.assign(**{col : X[col].astype(ctypes.c_int)})
- if not isinstance(user_mapping_, np.ndarray):
- user_mapping_ = user_mapping_.to_numpy()
+ user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1)
if swapped:
U, U_bin = U_bin, U
@@ -508,25 +531,22 @@ def _process_U_df(self, U, is_I=False, df_name="U"):
m = 0
p = 0
if U is not None:
- if "ColumnId" in U.columns.values:
- Urow = U[cl_take].astype(ctypes.c_int).to_numpy()
- Ucol = U.ColumnId.astype(ctypes.c_int).to_numpy()
- if "Value" not in U.columns.values:
+ if "ColumnId" in U.columns:
+ Urow = U[cl_take].to_numpy(copy=False, dtype=ctypes.c_int)
+ Ucol = U["ColumnId"].to_numpy(copy=False, dtype=ctypes.c_int)
+ if "Value" not in U.columns:
msg = "If passing sparse '%s', must have column 'Value'."
msg = msg % df_name
raise ValueError(msg)
- Uval = U.Value.astype(self.dtype_).to_numpy()
+ Uval = U["Value"].to_numpy(copy=False, dtype=self.dtype_)
m = int(Urow.max() + 1)
p = int(Ucol.max() + 1)
else:
U = U.sort_values(cl_take)
- Uarr = U[[cl for cl in U.columns.values if cl != cl_take]]
- Ucols = Uarr.columns.to_numpy()
- Uarr = Uarr.to_numpy()
- if not Uarr.flags["C_CONTIGUOUS"]:
- Uarr = np.ascontiguousarray(Uarr)
- if Uarr.dtype != self.dtype_:
- Uarr = Uarr.astype(self.dtype_)
+ Uarr = U[[cl for cl in U.columns if cl != cl_take]]
+ Ucols = Uarr.columns.to_numpy(copy=True)
+ Uarr = Uarr.to_numpy(copy=False, dtype=self.dtype_)
+ Uarr = np.require(Uarr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
m, p = Uarr.shape
return Urow, Ucol, Uval, Uarr, Ucols, m, p
@@ -556,7 +576,7 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False):
raise ValueError("Model was not fit to %s data." % name)
if isinstance(U, pd.DataFrame) and Cols.shape[0]:
U = U[Cols]
- U = np.array(U).reshape(-1).astype(self.dtype_)
+ U = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if U.shape[0] != Mat.shape[0]:
raise ValueError("Dimensions of %s don't match with earlier data."
% letter)
@@ -568,7 +588,7 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False):
raise ValueError("Model was not fit to %s binary data." % name)
if isinstance(U_bin, pd.DataFrame) and (ColsBin.shape[0]):
U_bin = U_bin[ColsBin]
- U_bin = np.array(U_bin).reshape(-1).astype(self.dtype_)
+ U_bin = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if U_bin.shape[0] != MatBin.shape[0]:
raise ValueError("Dimensions of %s_bin don't match with earlier data."
% letter)
@@ -576,12 +596,22 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False):
U_bin = np.empty(0, dtype=self.dtype_)
###
if U_col is not None:
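+ # Normalize the sparse inputs up front so that a length mismatch between
+ # '*_col' and '*_val' is reported before any reindexing happens below.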
+ U_val = np.require(
+ U_val,
+ dtype=self.dtype_,
+ requirements=["ENSUREARRAY", "C_CONTIGUOUS"]
+ ).reshape(-1)
+ U_col = np.require(
+ U_col,
+ dtype=ctypes.c_int if not self.reindex_ else None,
+ requirements=["ENSUREARRAY", "C_CONTIGUOUS"]
+ ).reshape(-1)
+ if U_val.shape[0] != U_col.shape[0]:
+ raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter))
+
if Mat.shape[0] == 0:
raise ValueError("Model was not fit to %s data." % name)
- U_val = np.array(U_val).reshape(-1).astype(self.dtype_)
if U_val.shape[0] == 0:
- if np.array(U_col).shape[0] > 0:
- raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter))
U_col = np.empty(0, dtype=ctypes.c_int)
U_val = np.empty(0, dtype=self.dtype_)
else:
@@ -592,12 +622,13 @@ def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False):
except Exception:
raise ValueError("Sparse inputs cannot contain missing values.")
else:
- U_col = pd.Categorical(U_col, mapping).codes.astype(ctypes.c_int)
+ U_col = pd.Categorical(U_col, mapping).codes
+ U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if np.any(U_col < 0):
raise ValueError("Sparse inputs cannot contain missing values.")
- U_col = U_col.astype(ctypes.c_int)
+ U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
- U_col = np.array(U_col).reshape(-1).astype(ctypes.c_int)
+ U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = U_col.min(), U_col.max()
if np.isnan(imin) or np.isnan(imax):
raise ValueError("Sparse inputs cannot contain missing values.")
@@ -633,37 +664,31 @@ def _process_new_U_2d(self, U, is_I=False, allow_csr=False):
msg += "as the data passed to 'fit'."
raise ValueError(msg % letter)
- if issparse(U) and (not isspmatrix_coo(U)) and (not isspmatrix_csr(U)):
- U = U.tocoo()
- elif isspmatrix_csr(U) and not allow_csr:
- U = U.tocoo()
+ if issparse(U):
+ if (U.format not in ["coo", "csr"]):
+ U = U.tocoo()
+ elif (U.format == "csr") and not allow_csr:
+ U = U.tocoo()
if isinstance(U, pd.DataFrame):
- if col_id in U.columns.values:
+ if col_id in U.columns:
warnings.warn("'%s' not meaningful for new inputs." % col_id)
if Cols.shape[0]:
U = U[Cols]
- Uarr = U.to_numpy()
- Uarr = np.ascontiguousarray(Uarr)
- if Uarr.dtype != self.dtype_:
- Uarr = Uarr.astype(self.dtype_)
-
- elif isspmatrix_coo(U):
- Urow = U.row.astype(ctypes.c_int)
- Ucol = U.col.astype(ctypes.c_int)
- Uval = U.data.astype(self.dtype_)
- elif isspmatrix_csr(U):
+ Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+
+ elif _is_coo(U):
+ Urow = np.require(U.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Ucol = np.require(U.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Uval = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ elif _is_csr(U):
if not allow_csr:
raise ValueError("Unexpected error.")
- Ucsr_p = U.indptr.astype(ctypes.c_size_t)
- Ucsr_i = U.indices.astype(ctypes.c_int)
- Ucsr = U.data.astype(self.dtype_)
+ Ucsr_p = np.require(U.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Ucsr_i = np.require(U.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Ucsr = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif isinstance(U, np.ndarray):
- if not U.flags["C_CONTIGUOUS"]:
- U = np.ascontiguousarray(U)
- if U.dtype != self.dtype_:
- U = U.astype(self.dtype_)
- Uarr = U
+ Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif U is None:
pass
else:
@@ -694,19 +719,13 @@ def _process_new_Ub_2d(self, U_bin, is_I=False):
raise ValueError(msg % letter)
if isinstance(U_bin, pd.DataFrame):
- if col_id in U_bin.columns.values:
+ if col_id in U_bin.columns:
warnings.warn("'%s' not meaningful for new inputs." % col_id)
if Cols.shape[0]:
U_bin = U_bin[Cols]
- Ub_arr = U_bin.to_numpy()
- Ub_arr = np.ascontiguousarray(Ub_arr)
- if Ub_arr.dtype != self.dtype_:
- Ub_arr = Ub_arr.astype(self.dtype_)
+ Ub_arr = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif isinstance(Ub_arr, np.ndarray):
- if not Ub_arr.flags["C_CONTIGUOUS"]:
- Ub_arr = np.ascontiguousarray(Ub_arr)
- if Ub_arr.dtype != self.dtype_:
- Ub_arr = Ub_arr.astype(self.dtype_)
+ Ub_arr = np.require(Ub_arr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif Ub_arr is None:
pass
else:
@@ -730,62 +749,54 @@ def _process_new_X_2d(self, X, W=None):
W_sp = np.empty(0, dtype=self.dtype_)
m, n = X.shape
- if issparse(X) and (not isspmatrix_coo(X)) and (not isspmatrix_csr(X)):
+ # TODO: why is this needed? should it error out with CSC or is it somehow used internally?
+ if issparse(X) and (X.format not in ["coo", "csr"]):
if (W is not None) and (not issparse(W)):
- if not isinstance(W, np.ndarray):
- W = np.array(W).reshape(-1)
- if W.shape[0] != X.nnz:
- raise ValueError("'X' and 'W' have different number of entries.")
- if isspmatrix_csc(X):
- W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]))
+ W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ if W.shape[0] != X.data.shape[0]:
+ raise ValueError("'X' and 'W' have different number of entries.")
+ if (X.format == "csc"):
+ W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
else:
raise ValueError("Must pass 'X' as SciPy sparse COO if there are weights.")
X = X.tocoo()
- if issparse(W) and (not isspmatrix_coo(W)) and (not isspmatrix_csr(W)):
+ if issparse(W) and (W.format not in ["coo", "csr"]):
W = W.tocoo()
- if (isspmatrix_coo(X) != isspmatrix_coo(W)):
- if not isspmatrix_coo(X):
+ if issparse(X) and issparse(W) and ((X.format == "coo") != (W.format == "coo")):
+ if not _is_coo(X):
X = X.tocoo()
- if not isspmatrix_coo(W):
+ if not _is_coo(W):
W = W.tocoo()
if issparse(W):
- W = W.data
+ W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
- if isspmatrix_coo(X):
- Xrow = X.row.astype(ctypes.c_int)
- Xcol = X.col.astype(ctypes.c_int)
- Xval = X.data.astype(self.dtype_)
+ if _is_coo(X):
+ Xrow = np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Xcol = np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Xval = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
- W_sp = np.array(W).reshape(-1).astype(self.dtype_)
+ W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != Xval.shape[0]:
msg = "'W' must have the same number of non-zero entries "
msg += "as 'X'."
raise ValueError(msg)
- elif isspmatrix_csr(X):
- Xcsr_p = X.indptr.astype(ctypes.c_size_t)
- Xcsr_i = X.indices.astype(ctypes.c_int)
- Xcsr = X.data.astype(self.dtype_)
+ elif _is_csr(X):
+ Xcsr_p = np.require(X.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Xcsr_i = np.require(X.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ Xcsr = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
- W_sp = np.array(W).reshape(-1).astype(self.dtype_)
+ W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != Xcsr.shape[0]:
msg = "'W' must have the same number of non-zero entries "
msg += "as 'X'."
raise ValueError(msg)
elif isinstance(X, np.ndarray):
- if not X.flags["C_CONTIGUOUS"]:
- X = np.ascontiguousarray(X)
- if X.dtype != self.dtype_:
- X = X.astype(self.dtype_)
- Xarr = X
+ Xarr = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
assert W.shape[0] == X.shape[0]
assert W.shape[1] == X.shape[1]
- if not W.flags["C_CONTIGUOUS"]:
- W = np.ascontiguousarray(W)
- if W.dtype != self.dtype_:
- W = W.astype(self.dtype_)
- W_dense = W
+ W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
raise ValueError("'X' must be a SciPy CSR or COO matrix, or NumPy array.")
@@ -804,30 +815,28 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True
raise ValueError("'include' and 'exclude' should not contain missing values.")
if include is not None and exclude is not None:
raise ValueError("Cannot pass 'include' and 'exclude' together.")
- include = np.array(include).reshape(-1) if include is not None \
- else np.empty(0, dtype=ctypes.c_int)
- exclude = np.array(exclude).reshape(-1) if exclude is not None \
- else np.empty(0, dtype=ctypes.c_int)
-
- if isinstance(user, (list, tuple)) :
- user = np.array(user)
- if isinstance(item, (list, tuple)):
- item = np.array(item)
- if isinstance(user, pd.Series):
- user = user.to_numpy()
- if isinstance(item, pd.Series):
- item = item.to_numpy()
+
+ if include is not None:
+ include = np.require(include, requirements=["ENSUREARRAY"]).reshape(-1)
+ else:
+ include = np.empty(0, dtype=ctypes.c_int)
+ if exclude is not None:
+ exclude = np.require(exclude, requirements=["ENSUREARRAY"]).reshape(-1)
+ else:
+ exclude = np.empty(0, dtype=ctypes.c_int)
+
+ if (user is not None) and (not np.isscalar(user)):
+ user = np.require(user, requirements=["ENSUREARRAY"]).reshape(-1)
+ if (item is not None) and (not np.isscalar(item)):
+ item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1)
if user is not None:
if isinstance(user, np.ndarray):
- if len(user.shape) > 1:
- user = user.reshape(-1)
assert user.shape[0] > 0
if self.reindex_:
if user.shape[0] > 1:
user = pd.Categorical(user, self.user_mapping_).codes
- if user.dtype != ctypes.c_int:
- user = user.astype(ctypes.c_int)
+ user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if len(self.user_dict_):
try:
@@ -850,14 +859,11 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True
if item is not None:
if isinstance(item, np.ndarray):
- if len(item.shape) > 1:
- item = item.reshape(-1)
assert item.shape[0] > 0
if self.reindex_:
if item.shape[0] > 1:
item = pd.Categorical(item, self.item_mapping_).codes
- if item.dtype != ctypes.c_int:
- item = item.astype(ctypes.c_int)
+ item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if len(self.item_dict_):
try:
@@ -893,9 +899,7 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True
if np.any(include < 0):
raise ValueError(msg % "include")
- if include.dtype != ctypes.c_int:
- include = include.astype(ctypes.c_int)
- include = include.reshape(-1)
+ include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if exclude.shape[0]:
if len(self.item_dict_):
try:
@@ -904,36 +908,31 @@ def _process_users_items(self, user, item, include, exclude, allows_no_item=True
raise ValueError(msg % "exclude")
else:
exclude = pd.Categorical(exclude, self.item_mapping_).codes
- if exclude.dtype != ctypes.c_int:
- exclude = exclude.astype(ctypes.c_int)
if np.any(exclude < 0):
raise ValueError(msg % "exclude")
- if exclude.dtype != ctypes.c_int:
- exclude = exclude.astype(ctypes.c_int)
- exclude = exclude.reshape(-1)
+ exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
else:
msg = "'%s' entries must be within the range of the %s (%s)"
msg += " of the data that was passed to 'fit'."
if include.shape[0]:
+ include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = include.min(), include.max()
if (imin < 0) or (imax >= self._B_pred.shape[0]):
raise ValueError(msg % ("include", "items", "columns"))
if exclude.shape[0]:
+ exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
emin, emax = exclude.min(), exclude.max()
if (emin < 0) or (emax >= self._B_pred.shape[0]):
raise ValueError(msg % ("exclude", "items", "columns"))
if user is not None:
- user = user.astype(ctypes.c_int)
+ user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if item is not None:
- item = item.astype(ctypes.c_int)
- if include.dtype != ctypes.c_int:
- include = include.astype(ctypes.c_int)
- if exclude.dtype != ctypes.c_int:
- exclude = exclude.astype(ctypes.c_int)
-
+ item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
return user, item, include, exclude
@@ -945,29 +944,27 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
self._reset()
- if issparse(X) and (not isspmatrix_coo(X)):
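+ # Convert 'W' together with 'X' so the weights stay aligned with X's entries.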
+ if issparse(X) and (not (X.format == "coo")):
if (W is not None) and (not issparse(W)):
- if isspmatrix_csr(X):
- if not isinstance(W, np.ndarray):
- W = np.array(W).reshape(-1)
- if W.shape[0] != X.nnz:
+ if (X.format == "csr"):
+ W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ if W.shape[0] != X.data.shape[0]:
raise ValueError("'X' and 'W' have different number of entries.")
- W = csr_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]))
+ W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
- elif isspmatrix_csc(X):
- if not isinstance(W, np.ndarray):
- W = np.array(W).reshape(-1)
- if W.shape[0] != X.nnz:
+ elif (X.format == "csc"):
+ W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ if W.shape[0] != X.data.shape[0]:
raise ValueError("'X' and 'W' have different number of entries.")
- W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]))
+ W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
else:
raise ValueError("Must pass 'X' as SciPy COO if passing weights.")
X = X.tocoo()
- if issparse(W) and (not isspmatrix_coo(W)):
+ if issparse(W) and (not (W.format == "coo")):
W = W.tocoo()
if issparse(W):
- W = W.data
+ W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if isinstance(X, pd.DataFrame):
msg = "If passing 'X' as DataFrame, '%s' must also be a DataFrame."
@@ -985,45 +982,36 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
msg += "called 'Weight'."
raise ValueError(msg)
- assert "UserId" in X.columns.values
- assert "ItemId" in X.columns.values
- if (self._implicit) and ("Rating" in X.columns.values) and ("Value" not in X.columns.values):
- X = X.rename(columns={"Rating":"Value"}, copy=False, inplace=False)
+ assert "UserId" in X.columns
+ assert "ItemId" in X.columns
+ if (self._implicit) and ("Rating" in X.columns) and ("Value" not in X.columns):
+ X = X.rename(columns={"Rating":"Value"}, copy=False)
if self._implicit:
- assert "Value" in X.columns.values
+ assert "Value" in X.columns
else:
- assert "Rating" in X.columns.values
+ assert "Rating" in X.columns
if U is not None:
- assert "UserId" in U.columns.values
+ assert "UserId" in U.columns
if I is not None:
- assert "ItemId" in I.columns.values
+ assert "ItemId" in I.columns
if U_bin is not None:
- assert "UserId" in U_bin.columns.values
+ assert "UserId" in U_bin.columns
if I_bin is not None:
- assert "ItemId" in I_bin.columns.values
+ assert "ItemId" in I_bin.columns
X, U, U_bin, self.user_mapping_, append_U, append_Ub = self._convert_ids(X, U, U_bin, "UserId")
X, I, I_bin, self.item_mapping_, append_I, append_Ib = self._convert_ids(X, I, I_bin, "ItemId")
- Xrow = X.UserId.to_numpy()
- Xcol = X.ItemId.to_numpy()
- if Xrow.dtype != ctypes.c_int:
- Xrow = Xrow.astype(ctypes.c_int)
- if Xcol.dtype != ctypes.c_int:
- Xcol = Xcol.astype(ctypes.c_int)
- if self._implicit:
- Xval = X.Value.to_numpy()
- else:
- Xval = X.Rating.to_numpy()
- if Xval.dtype != self.dtype_:
- Xval = Xval.astype(self.dtype_)
+ Xrow = X["UserId"].to_numpy(copy=False, dtype=ctypes.c_int)
+ Xcol = X["ItemId"].to_numpy(copy=False, dtype=ctypes.c_int)
+ Xval = X["Value" if self._implicit else "Rating"].to_numpy(copy=False, dtype=self.dtype_)
if Xval.shape[0] == 0:
raise ValueError("'X' contains no non-zero entries.")
Xarr = np.empty((0,0), dtype=self.dtype_)
W_sp = np.empty(0, dtype=self.dtype_)
- if "Weight" in X.columns.values:
- W_sp = X.Weight.astype(self.dtype_).to_numpy()
+ if "Weight" in X.columns:
+ W_sp = X["Weight"].to_numpy(copy=False, dtype=self.dtype_)
W_dense = np.empty((0,0), dtype=self.dtype_)
Urow, Ucol, Uval, Uarr, self._U_cols, m_u, p = self._process_U_df(U, False, "U")
@@ -1037,11 +1025,11 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
qbin = 0
msg = "Binary side info data cannot be passed in sparse format."
if U_bin is not None:
- if "ColumnId" in U_bin.columns.values:
+ if "ColumnId" in U_bin.columns:
raise ValueError(msg)
_1, _2, _3, Ub_arr, self._Ub_cols, m_ub, pbin = self._process_U_df(U_bin, False, "U_bin")
if I_bin is not None:
- if "ColumnId" in I_bin.columns.values:
+ if "ColumnId" in I_bin.columns:
raise ValueError(msg)
_1, _2, _3, Ib_arr, self._Ib_cols, n_ib, qbin = self._process_U_df(I_bin, True, "U_bin")
@@ -1067,28 +1055,28 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
self.user_dict_ = {self.user_mapping_[i]:i for i in range(self.user_mapping_.shape[0])}
self.item_dict_ = {self.item_mapping_[i]:i for i in range(self.item_mapping_.shape[0])}
- elif isspmatrix_coo(X) or isinstance(X, np.ndarray):
- if issparse(U) and not isspmatrix_coo(U):
+ elif _is_coo(X) or isinstance(X, np.ndarray):
+ if issparse(U) and not (U.format == "coo"):
U = U.tocoo()
- if issparse(I) and not isspmatrix_coo(I):
+ if issparse(I) and not (I.format == "coo"):
I = I.tocoo()
msg = " must be a Pandas DataFrame, NumPy array, or SciPy sparse COO matrix."
msg_bin = " must be a Pandas DataFrame or NumPy array."
- if U is not None and not (isinstance(U, (pd.DataFrame, np.ndarray)) or isspmatrix_coo(U)):
+ if U is not None and not (isinstance(U, (pd.DataFrame, np.ndarray)) or _is_coo(U)):
raise ValueError("'U'" + msg)
- if I is not None and not (isinstance(I, (pd.DataFrame, np.ndarray)) or isspmatrix_coo(I)):
+ if I is not None and not (isinstance(I, (pd.DataFrame, np.ndarray)) or _is_coo(I)):
raise ValueError("'I'" + msg)
if U_bin is not None and not isinstance(U_bin, (pd.DataFrame, np.ndarray)):
raise ValueError("'U_bin'" + msg_bin)
if I_bin is not None and not isinstance(I_bin, (pd.DataFrame, np.ndarray)):
raise ValueError("'I_bin'" + msg_bin)
if W is not None:
- if isinstance(W, (list, pd.Series)):
- W = np.array(W)
- if (len(W.shape) > 1) and isspmatrix_coo(X):
+ if not issparse(W):
+ W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY"])
+ if (len(W.shape) > 1) and _is_coo(X):
W = W.reshape(-1)
if (not isinstance(W, np.ndarray)) or \
- (isspmatrix_coo(X) and W.shape[0] != X.nnz) or\
+ (_is_coo(X) and W.shape[0] != X.data.shape[0]) or\
(isinstance(X, np.ndarray) and (W.shape[0] != X.shape[0] or W.shape[1] != X.shape[1])):
raise ValueError("'W' must be an array with the same number of entries as 'X'.")
@@ -1107,14 +1095,14 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
W_sp = np.empty(0, dtype=self.dtype_)
W_dense = np.empty((0,0), dtype=self.dtype_)
if W is not None:
- if issparse(W) and not isspmatrix_coo(W):
+ if issparse(W) and not (W.format == "coo"):
W = W.tocoo()
if issparse(W):
- W = W.data
- if isspmatrix_coo(X):
+ W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ if _is_coo(X):
W_sp = W.astype(self.dtype_)
else:
- W_dense = W.astype(self.dtype_)
+ W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
self.reindex_ = False
@@ -1127,7 +1115,7 @@ def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
else:
m = int(Xrow.max() + 1)
n = int(Xcol.max() + 1)
- if isspmatrix_coo(X):
+ if _is_coo(X):
m = max(m, X.shape[0])
n = max(n, X.shape[1])
if enforce_same_shape:
@@ -1230,11 +1218,12 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None):
if self._only_prediction_info:
raise ValueError("Cannot use this function after dropping non-essential matrices.")
+ user_was_not_None = user is not None
user, item, _1, _2 = self._process_users_items(user, item, None, None)
c_funs = wrapper_float if self.use_float else wrapper_double
- if user is not None:
+ if user_was_not_None:
assert user.shape[0] == item.shape[0]
if user.shape[0] == 1:
@@ -1268,8 +1257,8 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None):
self.user_bias_,
self.item_bias_,
self.glob_mean_,
- np.array(user).astype(ctypes.c_int),
- np.array(item).astype(ctypes.c_int),
+ np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
+ np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
@@ -1280,15 +1269,15 @@ def _predict(self, user=None, a_vec=None, a_bias=0., item=None):
self.user_bias_,
self.item_bias_,
self.glob_mean_,
- np.array(user).astype(ctypes.c_int),
- np.array(item).astype(ctypes.c_int),
+ np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
+ np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
#### When passing the factors directly
else:
- item = np.array([item]).reshape(-1)
+ item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1)
nan_entries = (item == -1)
outp = self._B_pred[item, self.k_item:].reshape((item.shape[0],-1)).dot(a_vec[self.k_user:])
outp += a_bias + self.glob_mean_
@@ -1318,7 +1307,7 @@ def _predict_new(self, user, B):
np.zeros(n, dtype=self.dtype_) if self.item_bias \
else np.empty(0, dtype=self.dtype_),
self.glob_mean_,
- np.array(user).astype(ctypes.c_int),
+ np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
np.arange(n).astype(ctypes.c_int),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
@@ -1347,7 +1336,7 @@ def _predict_user_multiple(self, A, item, bias=None):
self.item_bias_,
self.glob_mean_,
np.arange(m).astype(ctypes.c_int),
- np.array(item).astype(ctypes.c_int),
+ np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
@@ -1359,7 +1348,7 @@ def _predict_user_multiple(self, A, item, bias=None):
self.item_bias_,
self.glob_mean_,
np.arange(m).astype(ctypes.c_int),
- np.array(item).astype(ctypes.c_int),
+ np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
@@ -1436,7 +1425,7 @@ def _topN(self, user=None, a_vec=None, a_bias=0, B=None,
msg += "fewer than 'n' to rank."
raise ValueError(msg)
- if user is not None:
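+ # pd.Categorical assigns code -1 to IDs not seen during 'fit'; avoid
+ # indexing _A_pred with a negative position.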
+ if (user is not None) and (user.min() >= 0):
user = user[0]
a_vec = self._A_pred[user].reshape(-1)
user_bias_ = 0.
@@ -1587,15 +1576,11 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None,
W_sp = np.empty(0, dtype=self.dtype_)
if len(X.shape) > 1:
warnings.warn("Passed a 2-d array for 'X' - method expects a single row.")
- X = np.array(X).reshape(-1)
- if X.dtype != self.dtype_:
- X = X.astype(self.dtype_)
+ X = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if X.shape[0] != self._n_orig:
raise ValueError("'X' must have the same columns as when passed to 'fit'.")
if W is not None:
- W_dense = np.array(W).reshape(-1)
- if W_dense.dtype != self.dtype_:
- W_dense = W_dense.astype(self.dtype_)
+ W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_dense.shape[0] != X.shape[0]:
raise ValueError("'W' must have the same number of entries as X.")
else:
@@ -1603,28 +1588,20 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None,
else:
X = np.empty(0, dtype=self.dtype_)
W_dense = np.empty(0, dtype=self.dtype_)
- X_val = np.array(X_val).reshape(-1)
- if X_val.dtype != self.dtype_:
- X_val = X_val.astype(self.dtype_)
+ X_val = np.require(X_val, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if X_val.shape[0] == 0:
- X_col = np.array(X_col).reshape(-1)
- if X_col.dtype != ctypes.c_int:
- X_col = X_col.astype(ctypes.c_int)
+ X_col = np.require(X_col, requirements=["ENSUREARRAY"]).reshape(-1)
if X_col.shape[0] > 0:
raise ValueError("'X_col' and 'X_val' must have the same number of entries.")
else:
if self.reindex_:
- X_col = np.array(X_col).reshape(-1)
X_col = pd.Categorical(X_col, self.item_mapping_).codes
- if X_col.dtype != ctypes.c_int:
- X_col = X_col.astype(ctypes.c_int)
+ X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if np.any(X_col < 0):
raise ValueError("'X_col' must have the same item/column entries as passed to 'fit'.")
else:
- X_col = np.array(X_col).reshape(-1)
- if X_col.dtype != ctypes.c_int:
- X_col = X_col.astype(ctypes.c_int)
+ X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = np.min(X_col), np.max(X_col)
if (imin < 0) or (imax >= self._n_orig) or np.isnan(imin) or np.isnan(imax):
msg = "Column indices ('X_col') must be within the range"
@@ -1639,9 +1616,7 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None,
raise ValueError("'X' is empty.")
if W is not None:
- W_sp = np.array(W).reshape(-1)
- if W_sp.dtype != self.dtype_:
- W_sp = W_sp.astype(self.dtype_)
+ W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != X_col.shape[0]:
raise ValueError("'W' must have the same number of entries as 'X_val'.")
else:
@@ -1665,26 +1640,25 @@ def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None,
def _process_transform_inputs(self, X, U, U_bin, W, replace_existing):
if (W is not None) and (issparse(W) != issparse(X)):
raise ValueError("'X' and 'W' must be in the same format.")
- if issparse(X) and not isspmatrix_coo(X):
+ if issparse(X) and not (X.format == "coo"):
if (W is not None) and (not issparse(W)):
- if not isinstance(W, np.ndarray):
- W = np.array(W).reshape(-1)
- if W.shape[0] != X.nnz:
- raise ValueError("'X' and 'W' must have the same number of entries.")
- if isspmatrix_csr(X):
- W = csr_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]))
+ W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
+ if W.shape[0] != X.data.shape[0]:
+ raise ValueError("'X' and 'W' must have the same number of entries.")
+ if _is_csr(X):
+ W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
- elif isspmatrix_csc(X):
- W = csc_matrix((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]))
+ elif _is_csc(X):
+ W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
else:
raise ValueError("Must pass 'X' as SciPy COO if there are weights.")
X = X.tocoo()
- if issparse(W) and not isspmatrix_coo(W):
+ if issparse(W) and not (W.format == "coo"):
W = W.tocoo()
if issparse(W):
- W = W.data
- if issparse(U) and (not isspmatrix_coo(U)) and (not isspmatrix_csr(U)):
+ W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ if issparse(U) and (U.format not in ["coo", "csr"]):
U = U.tocoo()
if (X is None) and (U is None) and (U_bin is None):
@@ -1697,7 +1671,7 @@ def _process_transform_inputs(self, X, U, U_bin, W, replace_existing):
raise ValueError("Must pass 'X' if not passing 'replace_existing'.")
if isinstance(X, np.ndarray):
mask_take = ~pd.isnull(X)
- elif isspmatrix_coo(X):
+ elif _is_coo(X):
mask_take = np.repeat(False, X.shape[0]*X.shape[1]).reshape((X.shape[0], X.shape[1]))
mask_take[X.row, X.col] = True
else:
@@ -1773,13 +1747,13 @@ def _transform_step(self, A, A_bias, mask_take, Xorig):
if self.item_bias:
outp += self.item_bias_.reshape((1,-1))
- if issparse(Xorig) and not isspmatrix_coo(Xorig):
+ if issparse(Xorig) and not (Xorig.format == "coo"):
Xorig = Xorig.tocoo()
if mask_take is not None:
if isinstance(Xorig, np.ndarray):
outp[mask_take] = Xorig[mask_take]
- elif isspmatrix_coo(X):
+ elif _is_coo(Xorig):
outp[mask_take] = Xorig.data
else:
raise ValueError("'X' must be a SciPy COO matrix or NumPy array.")
@@ -2918,7 +2892,7 @@ def __init__(self, k=40, lambda_=1e+1, method="als", use_cg=True,
nonneg=False, nonneg_C=False, nonneg_D=False, max_cd_steps=100,
precompute_for_predictions=True, include_all_X=True,
use_float=True,
- random_state=1, verbose=True, print_every=10,
+ random_state=1, verbose=False, print_every=10,
handle_interrupt=True, produce_dicts=False,
nthreads=-1, n_jobs=None):
self.k = k
@@ -4316,10 +4290,10 @@ def from_model_matrices(A, B, glob_mean=0., precompute=True,
):
raise ValueError("Must pass both 'scaling_biasA' and 'scaling_biasB'.")
- if (not isinstance(A, np.ndarray)) or (not A.flags["C_CONTIGUOUS"]):
- A = np.ascontiguousarray(A)
- if (not isinstance(B, np.ndarray)) or (not B.flags["C_CONTIGUOUS"]):
- B = np.ascontiguousarray(B)
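+ # Resolve the dtype first so 'A' and 'B' are coerced once, C-contiguous.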
+ dtype = ctypes.c_float if use_float else ctypes.c_double
+ A = np.require(A, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ B = np.require(B, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+
if (len(A.shape) != 2) or (len(B.shape) != 2):
raise ValueError("Model matrices must be 2-dimensional.")
@@ -4348,31 +4322,14 @@ def from_model_matrices(A, B, glob_mean=0., precompute=True,
n_jobs = n_jobs)
new_model._init()
- dtype = ctypes.c_double if not use_float else ctypes.c_float
-
if user_bias is not None:
- if not isinstance(user_bias, np.ndarray):
- user_bias = np.array(user_bias).reshape(-1)
+ user_bias = np.require(user_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if user_bias.shape[0] != A.shape[0]:
raise ValueError("'user_bias' dimension does not match with 'A'.")
- if not user_bias.flags["C_CONTIGUOUS"]:
- user_bias = np.ascontiguousarray(user_bias)
- if user_bias.dtype != dtype:
- user_bias = user_bias.astype(dtype)
if item_bias is not None:
- if not isinstance(item_bias, np.ndarray):
- item_bias = np.array(item_bias).reshape(-1)
+ item_bias = np.require(item_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if item_bias.shape[0] != B.shape[0]:
raise ValueError("'item_bias' dimension does not match with 'B'.")
- if not item_bias.flags["C_CONTIGUOUS"]:
- item_bias = np.ascontiguousarray(item_bias)
- if item_bias.dtype != dtype:
- item_bias = item_bias.astype(dtype)
-
- if (A.dtype != dtype):
- A = A.astype(dtype)
- if (B.dtype != dtype):
- B = B.astype(dtype)
new_model.A_ = A
new_model.B_ = B
@@ -4915,9 +4872,9 @@ def fit(self, X, U=None, I=None):
"""
self._init()
- if issparse(X) and not isspmatrix_coo(X):
+ if issparse(X) and not (X.format == "coo"):
X = X.tocoo()
- if not isspmatrix_coo(X) and not isinstance(X, pd.DataFrame):
+ if not _is_coo(X) and not isinstance(X, pd.DataFrame):
raise ValueError("'X' must be a Pandas DataFrame or SciPy sparse COO matrix.")
return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=None)
@@ -5662,10 +5619,10 @@ def from_model_matrices(A, B, precompute=True,
prediction methods such as ``topN`` and ``topN_warm`` can be used as if
it had been fitted through this software.
"""
- if (not isinstance(A, np.ndarray)) or (not A.flags["C_CONTIGUOUS"]):
- A = np.ascontiguousarray(A)
- if (not isinstance(B, np.ndarray)) or (not B.flags["C_CONTIGUOUS"]):
- B = np.ascontiguousarray(B)
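+ # As in the CMF version above: resolve the dtype first so 'A' and 'B'
+ # are coerced once.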
+ dtype = ctypes.c_float if use_float else ctypes.c_double
+ A = np.require(A, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+ B = np.require(B, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
+
if (len(A.shape) != 2) or (len(B.shape) != 2):
raise ValueError("Model matrices must be 2-dimensional.")
@@ -5676,8 +5633,7 @@ def from_model_matrices(A, B, precompute=True,
raise ValueError("Empty model matrices not supported.")
- dtype = ctypes.c_double if not use_float else ctypes.c_float
-
+
new_model = CMF_implicit(k = k,
lambda_ = lambda_,
l1_lambda = l1_lambda,
@@ -5689,11 +5645,6 @@ def from_model_matrices(A, B, precompute=True,
n_jobs = n_jobs)
new_model._init()
- if (A.dtype != dtype):
- A = A.astype(dtype)
- if (B.dtype != dtype):
- B = B.astype(dtype)
-
new_model.A_ = A
new_model.B_ = B
@@ -6386,7 +6337,7 @@ def __init__(self, k=50, lambda_=1e1, method="lbfgs", use_cg=True,
maxiter=10000, niter=10, parallelize="separate", corr_pairs=7,
max_cg_steps=3, precondition_cg=False, finalize_chol=True,
NA_as_zero=False, use_float=False,
- random_state=1, verbose=True, print_every=100,
+ random_state=1, verbose=False, print_every=100,
produce_dicts=False, handle_interrupt=True,
nthreads=-1, n_jobs=None):
self.k = k
@@ -7907,7 +7858,7 @@ class ContentBased(_OMF_Base):
"""
def __init__(self, k=20, lambda_=1e2, user_bias=False, item_bias=False,
add_intercepts=True, maxiter=3000, corr_pairs=3,
- parallelize="separate", verbose=True, print_every=100,
+ parallelize="separate", verbose=False, print_every=100,
random_state=1, use_float=True,
produce_dicts=False, handle_interrupt=True, start_with_ALS=True,
nthreads=-1, n_jobs=None):
@@ -8310,7 +8261,7 @@ def predict_cold(self, U, items):
Predicted ratings for the requested user-item combinations.
"""
assert self.is_fitted_
- items = np.array(items).reshape(-1)
+ items = np.require(items, requirements=["ENSUREARRAY"]).reshape(-1)
assert items.shape[0] == U.shape[0]
_1, items, _2, _3 = self._process_users_items(None, items, None, None)
diff --git a/example/cmfrec_movielens_sideinfo.ipynb b/example/cmfrec_movielens_sideinfo.ipynb
index b65af42..64fab36 100644
--- a/example/cmfrec_movielens_sideinfo.ipynb
+++ b/example/cmfrec_movielens_sideinfo.ipynb
@@ -182,7 +182,6 @@
"
\n",
" | \n",
" ItemId | \n",
- " pc0 | \n",
" pc1 | \n",
" pc2 | \n",
" pc3 | \n",
@@ -191,8 +190,8 @@
" pc6 | \n",
" pc7 | \n",
" pc8 | \n",
+ " pc9 | \n",
" ... | \n",
- " pc40 | \n",
" pc41 | \n",
" pc42 | \n",
" pc43 | \n",
@@ -202,128 +201,129 @@
" pc47 | \n",
" pc48 | \n",
" pc49 | \n",
+ " pc50 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
- " 1.192433 | \n",
- " 2.034965 | \n",
- " 2.679781 | \n",
- " 1.154823 | \n",
- " 0.715302 | \n",
- " 0.982528 | \n",
- " 1.251208 | \n",
- " -0.792800 | \n",
- " 1.605826 | \n",
+ " 1.193171 | \n",
+ " 2.085621 | \n",
+ " 2.634135 | \n",
+ " 1.156088 | \n",
+ " 0.721649 | \n",
+ " 0.995436 | \n",
+ " 1.250474 | \n",
+ " -0.779532 | \n",
+ " 1.616702 | \n",
" ... | \n",
- " -0.312568 | \n",
- " -0.089161 | \n",
- " -0.053227 | \n",
- " 0.230116 | \n",
- " 0.210211 | \n",
- " 0.098109 | \n",
- " -0.267214 | \n",
- " -0.191760 | \n",
- " 0.032658 | \n",
- " 0.065116 | \n",
+ " -0.317134 | \n",
+ " -0.070338 | \n",
+ " -0.019553 | \n",
+ " 0.169051 | \n",
+ " 0.201415 | \n",
+ " -0.094831 | \n",
+ " -0.250461 | \n",
+ " -0.149919 | \n",
+ " -0.031735 | \n",
+ " -0.177708 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
- " -1.333200 | \n",
- " 1.719346 | \n",
- " 1.383137 | \n",
- " 0.788332 | \n",
- " -0.487431 | \n",
- " 0.376546 | \n",
- " 0.803104 | \n",
- " -0.606602 | \n",
- " 0.914494 | \n",
+ " -1.333533 | \n",
+ " 1.743796 | \n",
+ " 1.352161 | \n",
+ " 0.795724 | \n",
+ " -0.484175 | \n",
+ " 0.380645 | \n",
+ " 0.804462 | \n",
+ " -0.598527 | \n",
+ " 0.917250 | \n",
" ... | \n",
- " 0.265190 | \n",
- " -0.294507 | \n",
- " 0.058127 | \n",
- " 0.013155 | \n",
- " 0.232314 | \n",
- " 0.332297 | \n",
- " 0.271467 | \n",
- " 0.112416 | \n",
- " -0.111115 | \n",
- " -0.042173 | \n",
+ " 0.300060 | \n",
+ " -0.261956 | \n",
+ " 0.054457 | \n",
+ " 0.003863 | \n",
+ " 0.304605 | \n",
+ " -0.315796 | \n",
+ " 0.360203 | \n",
+ " 0.152770 | \n",
+ " 0.144790 | \n",
+ " -0.096549 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
- " -1.363421 | \n",
- " -0.034093 | \n",
- " 0.528633 | \n",
- " -0.312122 | \n",
- " 0.468820 | \n",
- " 0.164593 | \n",
- " 0.021909 | \n",
- " 0.161554 | \n",
- " -0.231992 | \n",
+ " -1.363395 | \n",
+ " -0.017107 | \n",
+ " 0.530395 | \n",
+ " -0.316202 | \n",
+ " 0.469430 | \n",
+ " 0.164630 | \n",
+ " 0.019083 | \n",
+ " 0.159188 | \n",
+ " -0.232969 | \n",
" ... | \n",
- " 0.212216 | \n",
- " -0.103897 | \n",
- " -0.279957 | \n",
- " 0.032861 | \n",
- " 0.054336 | \n",
- " 0.212665 | \n",
- " -0.174429 | \n",
- " -0.105532 | \n",
- " -0.147704 | \n",
- " 0.137516 | \n",
+ " 0.215020 | \n",
+ " -0.060682 | \n",
+ " -0.280852 | \n",
+ " 0.001087 | \n",
+ " 0.084960 | \n",
+ " -0.257190 | \n",
+ " -0.136963 | \n",
+ " -0.113914 | \n",
+ " 0.128352 | \n",
+ " -0.203658 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
- " -1.238094 | \n",
- " -1.014399 | \n",
- " 0.790394 | \n",
- " -0.296004 | \n",
- " -0.095043 | \n",
- " -0.052266 | \n",
- " -0.180244 | \n",
- " -0.768811 | \n",
- " -0.400559 | \n",
+ " -1.237840 | \n",
+ " -0.993731 | \n",
+ " 0.809815 | \n",
+ " -0.303009 | \n",
+ " -0.088991 | \n",
+ " -0.049621 | \n",
+ " -0.179544 | \n",
+ " -0.771278 | \n",
+ " -0.400499 | \n",
" ... | \n",
- " 0.074246 | \n",
- " 0.033976 | \n",
- " -0.225773 | \n",
- " 0.416155 | \n",
- " 0.282287 | \n",
- " -0.324412 | \n",
- " -0.228171 | \n",
- " -0.191667 | \n",
- " -0.488943 | \n",
- " -0.468794 | \n",
+ " 0.066207 | \n",
+ " 0.056054 | \n",
+ " -0.223027 | \n",
+ " 0.400157 | \n",
+ " 0.292300 | \n",
+ " 0.260936 | \n",
+ " -0.307608 | \n",
+ " -0.224141 | \n",
+ " 0.488955 | \n",
+ " 0.439189 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
- " -1.613220 | \n",
- " -0.280142 | \n",
- " 1.119149 | \n",
- " -0.130238 | \n",
- " 0.397091 | \n",
- " 0.187158 | \n",
- " 0.108864 | \n",
- " -0.273748 | \n",
- " -0.260166 | \n",
+ " -1.611499 | \n",
+ " -0.251899 | \n",
+ " 1.126443 | \n",
+ " -0.135702 | \n",
+ " 0.403340 | \n",
+ " 0.187289 | \n",
+ " 0.108451 | \n",
+ " -0.275341 | \n",
+ " -0.261142 | \n",
" ... | \n",
- " 0.110984 | \n",
- " -0.126241 | \n",
- " -0.234988 | \n",
- " 0.487649 | \n",
- " -0.027990 | \n",
- " 0.103862 | \n",
- " -0.218475 | \n",
- " -0.315778 | \n",
- " -0.070719 | \n",
- " 0.052140 | \n",
+ " 0.109560 | \n",
+ " -0.086042 | \n",
+ " -0.236327 | \n",
+ " 0.461589 | \n",
+ " 0.013350 | \n",
+ " -0.192557 | \n",
+ " -0.234025 | \n",
+ " -0.369643 | \n",
+ " -0.041060 | \n",
+ " -0.074656 | \n",
"
\n",
" \n",
"\n",
@@ -331,26 +331,26 @@
""
],
"text/plain": [
- " ItemId pc0 pc1 pc2 pc3 pc4 pc5 \\\n",
- "0 1 1.192433 2.034965 2.679781 1.154823 0.715302 0.982528 \n",
- "1 2 -1.333200 1.719346 1.383137 0.788332 -0.487431 0.376546 \n",
- "2 3 -1.363421 -0.034093 0.528633 -0.312122 0.468820 0.164593 \n",
- "3 4 -1.238094 -1.014399 0.790394 -0.296004 -0.095043 -0.052266 \n",
- "4 5 -1.613220 -0.280142 1.119149 -0.130238 0.397091 0.187158 \n",
+ " ItemId pc1 pc2 pc3 pc4 pc5 pc6 \\\n",
+ "0 1 1.193171 2.085621 2.634135 1.156088 0.721649 0.995436 \n",
+ "1 2 -1.333533 1.743796 1.352161 0.795724 -0.484175 0.380645 \n",
+ "2 3 -1.363395 -0.017107 0.530395 -0.316202 0.469430 0.164630 \n",
+ "3 4 -1.237840 -0.993731 0.809815 -0.303009 -0.088991 -0.049621 \n",
+ "4 5 -1.611499 -0.251899 1.126443 -0.135702 0.403340 0.187289 \n",
"\n",
- " pc6 pc7 pc8 ... pc40 pc41 pc42 pc43 \\\n",
- "0 1.251208 -0.792800 1.605826 ... -0.312568 -0.089161 -0.053227 0.230116 \n",
- "1 0.803104 -0.606602 0.914494 ... 0.265190 -0.294507 0.058127 0.013155 \n",
- "2 0.021909 0.161554 -0.231992 ... 0.212216 -0.103897 -0.279957 0.032861 \n",
- "3 -0.180244 -0.768811 -0.400559 ... 0.074246 0.033976 -0.225773 0.416155 \n",
- "4 0.108864 -0.273748 -0.260166 ... 0.110984 -0.126241 -0.234988 0.487649 \n",
+ " pc7 pc8 pc9 ... pc41 pc42 pc43 pc44 \\\n",
+ "0 1.250474 -0.779532 1.616702 ... -0.317134 -0.070338 -0.019553 0.169051 \n",
+ "1 0.804462 -0.598527 0.917250 ... 0.300060 -0.261956 0.054457 0.003863 \n",
+ "2 0.019083 0.159188 -0.232969 ... 0.215020 -0.060682 -0.280852 0.001087 \n",
+ "3 -0.179544 -0.771278 -0.400499 ... 0.066207 0.056054 -0.223027 0.400157 \n",
+ "4 0.108451 -0.275341 -0.261142 ... 0.109560 -0.086042 -0.236327 0.461589 \n",
"\n",
- " pc44 pc45 pc46 pc47 pc48 pc49 \n",
- "0 0.210211 0.098109 -0.267214 -0.191760 0.032658 0.065116 \n",
- "1 0.232314 0.332297 0.271467 0.112416 -0.111115 -0.042173 \n",
- "2 0.054336 0.212665 -0.174429 -0.105532 -0.147704 0.137516 \n",
- "3 0.282287 -0.324412 -0.228171 -0.191667 -0.488943 -0.468794 \n",
- "4 -0.027990 0.103862 -0.218475 -0.315778 -0.070719 0.052140 \n",
+ " pc45 pc46 pc47 pc48 pc49 pc50 \n",
+ "0 0.201415 -0.094831 -0.250461 -0.149919 -0.031735 -0.177708 \n",
+ "1 0.304605 -0.315796 0.360203 0.152770 0.144790 -0.096549 \n",
+ "2 0.084960 -0.257190 -0.136963 -0.113914 0.128352 -0.203658 \n",
+ "3 0.292300 0.260936 -0.307608 -0.224141 0.488955 0.439189 \n",
+ "4 0.013350 -0.192557 -0.234025 -0.369643 -0.041060 -0.074656 \n",
"\n",
"[5 rows x 51 columns]"
]
@@ -424,122 +424,122 @@
" \n",
" 0 | \n",
" 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
"\n",
@@ -548,32 +548,32 @@
],
"text/plain": [
" UserId Gender_F Gender_M Age_1 Age_18 Age_25 Age_35 Age_45 Age_50 \\\n",
- "0 1 1 0 1 0 0 0 0 0 \n",
- "1 2 0 1 0 0 0 0 0 0 \n",
- "2 3 0 1 0 0 1 0 0 0 \n",
- "3 4 0 1 0 0 0 0 1 0 \n",
- "4 5 0 1 0 0 1 0 0 0 \n",
+ "0 1 True False True False False False False False \n",
+ "1 2 False True False False False False False False \n",
+ "2 3 False True False False True False False False \n",
+ "3 4 False True False False False False True False \n",
+ "4 5 False True False False True False False False \n",
"\n",
" Age_56 ... Occupation_unemployed Occupation_writer \\\n",
- "0 0 ... 0 0 \n",
- "1 1 ... 0 0 \n",
- "2 0 ... 0 0 \n",
- "3 0 ... 0 0 \n",
- "4 0 ... 0 1 \n",
+ "0 False ... False False \n",
+ "1 True ... False False \n",
+ "2 False ... False False \n",
+ "3 False ... False False \n",
+ "4 False ... False True \n",
"\n",
" Region_Middle Atlantic Region_Midwest Region_New England Region_South \\\n",
- "0 0 1 0 0 \n",
- "1 0 0 0 1 \n",
- "2 0 1 0 0 \n",
- "3 0 0 1 0 \n",
- "4 0 1 0 0 \n",
+ "0 False True False False \n",
+ "1 False False False True \n",
+ "2 False True False False \n",
+ "3 False False True False \n",
+ "4 False True False False \n",
"\n",
" Region_Southwest Region_UnknownOrNonUS Region_UsOther Region_West \n",
- "0 0 0 0 0 \n",
- "1 0 0 0 0 \n",
- "2 0 0 0 0 \n",
- "3 0 0 0 0 \n",
- "4 0 0 0 0 \n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False False \n",
+ "3 False False False False \n",
+ "4 False False False False \n",
"\n",
"[5 rows x 39 columns]"
]
@@ -627,8 +627,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 13 s, sys: 105 ms, total: 13.1 s\n",
- "Wall time: 892 ms\n"
+ "CPU times: user 6.75 s, sys: 1.56 s, total: 8.31 s\n",
+ "Wall time: 592 ms\n"
]
},
{
@@ -684,8 +684,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 17.2 s, sys: 168 ms, total: 17.4 s\n",
- "Wall time: 1.18 s\n"
+ "CPU times: user 11.2 s, sys: 13 s, total: 24.2 s\n",
+ "Wall time: 1.5 s\n"
]
},
{
@@ -740,8 +740,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 26min 6s, sys: 9.69 s, total: 26min 16s\n",
- "Wall time: 1min 39s\n"
+ "CPU times: user 13min 8s, sys: 23min 31s, total: 36min 39s\n",
+ "Wall time: 1min 57s\n"
]
},
{
@@ -761,9 +761,9 @@
"from cmfrec import ContentBased\n",
"\n",
"model_content_based = ContentBased(k=40, maxiter=0, user_bias=False, item_bias=False)\n",
- "model_content_based.fit(X=ratings.loc[ratings.ItemId.isin(item_sideinfo_pca.ItemId)],\n",
+ "model_content_based.fit(X=ratings.loc[lambda x: x[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])],\n",
" U=user_side_info,\n",
- " I=item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(ratings.ItemId)])"
+ " I=item_sideinfo_pca.loc[lambda x: x[\"ItemId\"].isin(ratings[\"ItemId\"])])"
]
},
{
@@ -784,8 +784,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 1.02 s, sys: 39.9 ms, total: 1.06 s\n",
- "Wall time: 70.6 ms\n"
+ "CPU times: user 304 ms, sys: 800 ms, total: 1.1 s\n",
+ "Wall time: 105 ms\n"
]
},
{
@@ -852,35 +852,35 @@
" \n",
" \n",
" UserId | \n",
- " 948.0 | \n",
+ " 948 | \n",
"
\n",
" \n",
" Gender_M | \n",
- " 1.0 | \n",
+ " True | \n",
"
\n",
" \n",
" Age_56 | \n",
- " 1.0 | \n",
+ " True | \n",
"
\n",
" \n",
" Occupation_programmer | \n",
- " 1.0 | \n",
+ " True | \n",
"
\n",
" \n",
" Region_Midwest | \n",
- " 1.0 | \n",
+ " True | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " 947\n",
- "UserId 948.0\n",
- "Gender_M 1.0\n",
- "Age_56 1.0\n",
- "Occupation_programmer 1.0\n",
- "Region_Midwest 1.0"
+ " 947\n",
+ "UserId 948\n",
+ "Gender_M True\n",
+ "Age_56 True\n",
+ "Occupation_programmer True\n",
+ "Region_Midwest True"
]
},
"execution_count": 9,
@@ -889,7 +889,7 @@
}
],
"source": [
- "user_side_info.loc[user_side_info.UserId == 948].T.where(lambda x: x > 0).dropna()"
+ "user_side_info.loc[user_side_info[\"UserId\"] == 948].T.where(lambda x: x > 0).dropna()"
]
},
{
@@ -1026,11 +1026,13 @@
}
],
"source": [
- "ratings\\\n",
- " .loc[ratings.UserId == 948]\\\n",
- " .sort_values(\"Rating\", ascending=False)\\\n",
- " .assign(Movie=lambda x: x.ItemId.map(movie_id_to_title))\\\n",
- " .head(10)"
+ "(\n",
+ " ratings\n",
+ " .loc[lambda x: x[\"UserId\"] == 948]\n",
+ " .sort_values(\"Rating\", ascending=False)\n",
+ " .assign(Movie=lambda x: x[\"ItemId\"].map(movie_id_to_title))\n",
+ " .head(10)\n",
+ ")"
]
},
{
@@ -1179,11 +1181,13 @@
}
],
"source": [
- "ratings\\\n",
- " .loc[ratings.UserId == 948]\\\n",
- " .sort_values(\"Rating\", ascending=True)\\\n",
- " .assign(Movie=lambda x: x.ItemId.map(movie_id_to_title))\\\n",
- " .head(10)"
+ "(\n",
+ " ratings\n",
+ " .loc[lambda x: x[\"UserId\"] == 948]\n",
+ " .sort_values(\"Rating\", ascending=True)\n",
+ " .assign(Movie=lambda x: x[\"ItemId\"].map(movie_id_to_title))\n",
+ " .head(10)\n",
+ ")"
]
},
{
@@ -1200,8 +1204,8 @@
"outputs": [],
"source": [
"### Will exclude already-seen movies\n",
- "exclude = ratings.ItemId.loc[ratings.UserId == 948]\n",
- "exclude_cb = exclude.loc[exclude.isin(item_sideinfo_pca.ItemId)]\n",
+ "exclude = ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948]\n",
+ "exclude_cb = exclude.loc[lambda x: x.isin(item_sideinfo_pca[\"ItemId\"])]\n",
"\n",
"### Recommended lists with those excluded\n",
"recommended_non_personalized = model_non_personalized.topN(user=948, n=10, exclude=exclude)\n",
@@ -1259,16 +1263,16 @@
"10) - City Lights (1931) - Average Rating: 4.39 - Number of ratings: 271\n",
"----------------\n",
"Recommended from ratings-only model\n",
- "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n",
- "2) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
- "3) - Mummy, The (1932) - Average Rating: 3.54 - Number of ratings: 162\n",
- "4) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
- "5) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
- "6) - City Lights (1931) - Average Rating: 4.39 - Number of ratings: 271\n",
- "7) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
+ "1) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n",
+ "2) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n",
+ "3) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
+ "4) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
+ "6) - Hurricane, The (1999) - Average Rating: 3.85 - Number of ratings: 509\n",
+ "7) - Contender, The (2000) - Average Rating: 3.78 - Number of ratings: 388\n",
"8) - Wolf Man, The (1941) - Average Rating: 3.76 - Number of ratings: 134\n",
- "9) - American History X (1998) - Average Rating: 4.23 - Number of ratings: 640\n",
- "10) - Chariots of Fire (1981) - Average Rating: 3.8 - Number of ratings: 634\n",
+ "9) - Apostle, The (1997) - Average Rating: 3.73 - Number of ratings: 471\n",
+ "10) - Mummy, The (1932) - Average Rating: 3.54 - Number of ratings: 162\n",
"----------------\n",
"Recommended from attributes-only model\n",
"1) - Shawshank Redemption, The (1994) - Average Rating: 4.55 - Number of ratings: 2227\n",
@@ -1277,22 +1281,22 @@
"4) - Jean de Florette (1986) - Average Rating: 4.32 - Number of ratings: 216\n",
"5) - It Happened One Night (1934) - Average Rating: 4.28 - Number of ratings: 374\n",
"6) - Central Station (Central do Brasil) (1998) - Average Rating: 4.28 - Number of ratings: 215\n",
- "7) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n",
- "8) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n",
- "9) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n",
- "10) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n",
+ "7) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n",
+ "8) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n",
+ "9) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n",
+ "10) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n",
"----------------\n",
"Recommended from hybrid model\n",
- "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n",
- "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
"3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n",
- "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
- "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
- "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
- "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
- "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
- "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n",
- "10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n"
+ "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n",
+ "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
+ "6) - Mr. Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n",
+ "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n",
+ "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
+ "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
+ "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n"
]
}
],
@@ -1386,15 +1390,15 @@
"output_type": "stream",
"text": [
"1) - Plan 9 from Outer Space (1958) - Average Rating: 2.63 - Number of ratings: 249\n",
- "2) - Anne Frank Remembered (1995) - Average Rating: 4.1 - Number of ratings: 41\n",
- "3) - Next Friday (1999) - Average Rating: 2.6 - Number of ratings: 168\n",
- "4) - Muppet Christmas Carol, The (1992) - Average Rating: 3.61 - Number of ratings: 262\n",
- "5) - Snow Day (2000) - Average Rating: 2.21 - Number of ratings: 122\n",
- "6) - Black Mask (Hak hap) (1996) - Average Rating: 3.08 - Number of ratings: 66\n",
- "7) - Foreign Student (1994) - Average Rating: 3.0 - Number of ratings: 2\n",
- "8) - Ballad of Narayama, The (Narayama Bushiko) (1982) - Average Rating: 3.95 - Number of ratings: 19\n",
- "9) - Around the World in 80 Days (1956) - Average Rating: 3.6 - Number of ratings: 269\n",
- "10) - Faust (1994) - Average Rating: 3.48 - Number of ratings: 31\n"
+ "2) - East-West (Est-ouest) (1999) - Average Rating: 3.77 - Number of ratings: 103\n",
+ "3) - Rugrats Movie, The (1998) - Average Rating: 2.78 - Number of ratings: 141\n",
+ "4) - Taste of Cherry (1997) - Average Rating: 3.53 - Number of ratings: 32\n",
+ "5) - Julien Donkey-Boy (1999) - Average Rating: 3.33 - Number of ratings: 12\n",
+ "6) - Original Kings of Comedy, The (2000) - Average Rating: 3.23 - Number of ratings: 147\n",
+ "7) - Maya Lin: A Strong Clear Vision (1994) - Average Rating: 4.1 - Number of ratings: 59\n",
+ "8) - Double Life of Veronique, The (La Double Vie de Véronique) (1991) - Average Rating: 3.94 - Number of ratings: 129\n",
+ "9) - Crash (1996) - Average Rating: 2.76 - Number of ratings: 141\n",
+ "10) - Faraway, So Close (In Weiter Ferne, So Nah!) (1993) - Average Rating: 3.71 - Number of ratings: 66\n"
]
}
],
@@ -1424,15 +1428,15 @@
"output_type": "stream",
"text": [
"1) - Wrong Trousers, The (1993) - Average Rating: 4.51 - Number of ratings: 882\n",
- "2) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n",
- "3) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
- "4) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n",
- "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
+ "2) - Willy Wonka and the Chocolate Factory (1971) - Average Rating: 3.86 - Number of ratings: 1313\n",
+ "3) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n",
+ "4) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "5) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n",
"6) - Close Shave, A (1995) - Average Rating: 4.52 - Number of ratings: 657\n",
- "7) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
- "8) - Shadow of a Doubt (1943) - Average Rating: 4.27 - Number of ratings: 233\n",
- "9) - Citizen Kane (1941) - Average Rating: 4.39 - Number of ratings: 1116\n",
- "10) - Christmas Carol, A (1938) - Average Rating: 3.99 - Number of ratings: 194\n"
+ "7) - Grand Day Out, A (1992) - Average Rating: 4.36 - Number of ratings: 473\n",
+ "8) - Citizen Kane (1941) - Average Rating: 4.39 - Number of ratings: 1116\n",
+ "9) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
+ "10) - Rebecca (1940) - Average Rating: 4.2 - Number of ratings: 386\n"
]
}
],
@@ -1466,22 +1470,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n",
- "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
"3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n",
- "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
- "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
- "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
- "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
- "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
- "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n",
- "10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n"
+ "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n",
+ "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
+ "6) - Mr. Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n",
+ "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n",
+ "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
+ "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
+ "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n"
]
}
],
"source": [
- "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings.ItemId.loc[ratings.UserId == 948],\n",
- " X_val=ratings.Rating.loc[ratings.UserId == 948],\n",
+ "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948],\n",
+ " X_val=ratings[\"Rating\"].loc[ratings[\"UserId\"] == 948],\n",
" exclude=exclude))"
]
},
@@ -1494,23 +1498,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "1) - Babe (1995) - Average Rating: 3.89 - Number of ratings: 1751\n",
- "2) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "1) - It's a Wonderful Life (1946) - Average Rating: 4.3 - Number of ratings: 729\n",
+ "2) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
"3) - Beauty and the Beast (1991) - Average Rating: 3.89 - Number of ratings: 1060\n",
- "4) - Singin' in the Rain (1952) - Average Rating: 4.28 - Number of ratings: 751\n",
- "5) - Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) - Average Rating: 3.99 - Number of ratings: 238\n",
- "6) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
- "7) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
- "8) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
- "9) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n",
- "10) - Green Mile, The (1999) - Average Rating: 4.15 - Number of ratings: 1222\n"
+ "4) - Arsenic and Old Lace (1944) - Average Rating: 4.17 - Number of ratings: 672\n",
+ "5) - Invasion of the Body Snatchers (1956) - Average Rating: 3.91 - Number of ratings: 628\n",
+ "6) - Mr. Smith Goes to Washington (1939) - Average Rating: 4.24 - Number of ratings: 383\n",
+ "7) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n",
+ "8) - Gold Rush, The (1925) - Average Rating: 4.19 - Number of ratings: 275\n",
+ "9) - Bride of Frankenstein (1935) - Average Rating: 3.91 - Number of ratings: 216\n",
+ "10) - Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Average Rating: 4.56 - Number of ratings: 628\n"
]
}
],
"source": [
- "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings.ItemId.loc[ratings.UserId == 948],\n",
- " X_val=ratings.Rating.loc[ratings.UserId == 948],\n",
- " U=user_side_info.loc[user_side_info.UserId == 948],\n",
+ "print_reclist(model_with_sideinfo.topN_warm(X_col=ratings[\"ItemId\"].loc[ratings[\"UserId\"] == 948],\n",
+ " X_val=ratings[\"Rating\"].loc[ratings[\"UserId\"] == 948],\n",
+ " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948],\n",
" exclude=exclude))"
]
},
@@ -1531,14 +1535,18 @@
"6) - Wallace & Gromit: The Best of Aardman Animation (1996) - Average Rating: 4.43 - Number of ratings: 438\n",
"7) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n",
"8) - Third Man, The (1949) - Average Rating: 4.45 - Number of ratings: 480\n",
- "9) - Life Is Beautiful (La Vita � bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n",
+ "9) - Life Is Beautiful (La Vita è bella) (1997) - Average Rating: 4.33 - Number of ratings: 1152\n",
"10) - Grand Day Out, A (1992) - Average Rating: 4.36 - Number of ratings: 473\n"
]
}
],
"source": [
- "print_reclist(model_with_sideinfo.topN_cold(U=user_side_info.loc[user_side_info.UserId == 948].drop(\"UserId\", axis=1),\n",
- " exclude=exclude))"
+ "print_reclist(\n",
+ " model_with_sideinfo.topN_cold(\n",
+ " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948].drop(\"UserId\", axis=1),\n",
+ " exclude=exclude\n",
+ " )\n",
+ ")"
]
},
{
@@ -1563,16 +1571,20 @@
"4) - Jean de Florette (1986) - Average Rating: 4.32 - Number of ratings: 216\n",
"5) - It Happened One Night (1934) - Average Rating: 4.28 - Number of ratings: 374\n",
"6) - Central Station (Central do Brasil) (1998) - Average Rating: 4.28 - Number of ratings: 215\n",
- "7) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n",
- "8) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n",
- "9) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n",
- "10) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n"
+ "7) - Man Who Would Be King, The (1975) - Average Rating: 4.13 - Number of ratings: 310\n",
+ "8) - Best Years of Our Lives, The (1946) - Average Rating: 4.12 - Number of ratings: 236\n",
+ "9) - Double Indemnity (1944) - Average Rating: 4.42 - Number of ratings: 551\n",
+ "10) - In the Heat of the Night (1967) - Average Rating: 4.13 - Number of ratings: 348\n"
]
}
],
"source": [
- "print_reclist(model_content_based.topN_cold(U=user_side_info.loc[user_side_info.UserId == 948].drop(\"UserId\", axis=1),\n",
- " exclude=exclude_cb))"
+ "print_reclist(\n",
+ " model_content_based.topN_cold(\n",
+ " U=user_side_info.loc[lambda x: x[\"UserId\"] == 948].drop(\"UserId\", axis=1),\n",
+ " exclude=exclude_cb\n",
+ " )\n",
+ ")"
]
},
{
@@ -1627,40 +1639,40 @@
"text": [
"Number of ratings in training data: 512972\n",
"Number of ratings in test data type (1): 128221\n",
- "Number of ratings in test data type (2): 153128\n",
- "Number of ratings in test data type (3): 138904\n",
- "Number of ratings in test data type (4): 36450\n"
+ "Number of ratings in test data type (2): 154507\n",
+ "Number of ratings in test data type (3): 139009\n",
+ "Number of ratings in test data type (4): 36774\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
- "users_train, users_test = train_test_split(ratings.UserId.unique(), test_size=0.2, random_state=1)\n",
- "items_train, items_test = train_test_split(ratings.ItemId.unique(), test_size=0.2, random_state=2)\n",
+ "users_train, users_test = train_test_split(ratings[\"UserId\"].unique(), test_size=0.2, random_state=1)\n",
+ "items_train, items_test = train_test_split(ratings[\"ItemId\"].unique(), test_size=0.2, random_state=2)\n",
"\n",
- "ratings_train, ratings_test1 = train_test_split(ratings.loc[ratings.UserId.isin(users_train) &\n",
- " ratings.ItemId.isin(items_train)],\n",
+ "ratings_train, ratings_test1 = train_test_split(ratings.loc[ratings[\"UserId\"].isin(users_train) &\n",
+ " ratings[\"ItemId\"].isin(items_train)],\n",
" test_size=0.2, random_state=123)\n",
- "users_train = ratings_train.UserId.unique()\n",
- "items_train = ratings_train.ItemId.unique()\n",
- "ratings_test1 = ratings_test1.loc[ratings_test1.UserId.isin(users_train) &\n",
- " ratings_test1.ItemId.isin(items_train)]\n",
- "\n",
- "user_attr_train = user_side_info.loc[user_side_info.UserId.isin(users_train)]\n",
- "item_attr_train = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(items_train)]\n",
- "\n",
- "ratings_test2 = ratings.loc[ratings.UserId.isin(users_train) &\n",
- " ~ratings.ItemId.isin(items_train) &\n",
- " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n",
- "ratings_test3 = ratings.loc[~ratings.UserId.isin(users_train) &\n",
- " ratings.ItemId.isin(items_train) &\n",
- " ratings.UserId.isin(user_side_info.UserId) &\n",
- " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n",
- "ratings_test4 = ratings.loc[~ratings.UserId.isin(users_train) &\n",
- " ~ratings.ItemId.isin(items_train) &\n",
- " ratings.UserId.isin(user_side_info.UserId) &\n",
- " ratings.ItemId.isin(item_sideinfo_pca.ItemId)]\n",
+ "users_train = ratings_train[\"UserId\"].unique()\n",
+ "items_train = ratings_train[\"ItemId\"].unique()\n",
+ "ratings_test1 = ratings_test1.loc[ratings_test1[\"UserId\"].isin(users_train) &\n",
+ " ratings_test1[\"ItemId\"].isin(items_train)]\n",
+ "\n",
+ "user_attr_train = user_side_info.loc[lambda x: x[\"UserId\"].isin(users_train)]\n",
+ "item_attr_train = item_sideinfo_pca.loc[lambda x: x[\"ItemId\"].isin(items_train)]\n",
+ "\n",
+ "ratings_test2 = ratings.loc[ratings[\"UserId\"].isin(users_train) &\n",
+ " ~ratings[\"ItemId\"].isin(items_train) &\n",
+ " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n",
+ "ratings_test3 = ratings.loc[~ratings[\"UserId\"].isin(users_train) &\n",
+ " ratings[\"ItemId\"].isin(items_train) &\n",
+ " ratings[\"UserId\"].isin(user_side_info[\"UserId\"]) &\n",
+ " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n",
+ "ratings_test4 = ratings.loc[~ratings[\"UserId\"].isin(users_train) &\n",
+ " ~ratings[\"ItemId\"].isin(items_train) &\n",
+ " ratings[\"UserId\"].isin(user_side_info[\"UserId\"]) &\n",
+ " ratings[\"ItemId\"].isin(item_sideinfo_pca[\"ItemId\"])]\n",
"\n",
"\n",
"print(\"Number of ratings in training data: %d\" % ratings_train.shape[0])\n",
@@ -1701,8 +1713,8 @@
" U=user_attr_train,\n",
" I=item_attr_train)\n",
"m_contentbased = ContentBased(k=40, user_bias=False, item_bias=False)\\\n",
- " .fit(X=ratings_train.loc[ratings_train.UserId.isin(user_attr_train.UserId) &\n",
- " ratings_train.ItemId.isin(item_attr_train.ItemId)],\n",
+ " .fit(X=ratings_train.loc[ratings_train[\"UserId\"].isin(user_attr_train[\"UserId\"]) &\n",
+ " ratings_train[\"ItemId\"].isin(item_attr_train[\"ItemId\"])],\n",
" U=user_attr_train,\n",
" I=item_attr_train)\n",
"m_mostpopular = MostPopular(user_bias=True)\\\n",
@@ -1726,44 +1738,44 @@
"output_type": "stream",
"text": [
"RMSE type 1 non-personalized model: 0.911 [rho: 0.580]\n",
- "RMSE type 1 ratings-only model: 0.897 [rho: 0.603]\n",
- "RMSE type 1 hybrid model: 0.860 [rho: 0.641]\n",
- "RMSE type 1 content-based model: 0.975 [rho: 0.486]\n"
+ "RMSE type 1 ratings-only model: 0.896 [rho: 0.603]\n",
+ "RMSE type 1 hybrid model: 0.861 [rho: 0.640]\n",
+ "RMSE type 1 content-based model: 0.975 [rho: 0.487]\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
- "pred_contetbased = m_mostpopular.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_contetbased = m_mostpopular.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 non-personalized model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_contetbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_contetbased)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_contetbased)[0,1]))\n",
"\n",
- "pred_ratingsonly = m_classic.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_ratingsonly = m_classic.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 ratings-only model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_ratingsonly,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_ratingsonly)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_ratingsonly)[0,1]))\n",
"\n",
- "pred_hybrid = m_collective.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_hybrid = m_collective.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
- "test_cb = ratings_test1.loc[ratings_test1.UserId.isin(user_attr_train.UserId) &\n",
- " ratings_test1.ItemId.isin(item_attr_train.ItemId)]\n",
- "pred_contentbased = m_contentbased.predict(test_cb.UserId, test_cb.ItemId)\n",
+ "test_cb = ratings_test1.loc[ratings_test1[\"UserId\"].isin(user_attr_train[\"UserId\"]) &\n",
+ " ratings_test1[\"ItemId\"].isin(item_attr_train[\"ItemId\"])]\n",
+ "pred_contentbased = m_contentbased.predict(test_cb[\"UserId\"], test_cb[\"ItemId\"])\n",
"print(\"RMSE type 1 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(test_cb.Rating,\n",
+ " (np.sqrt(mean_squared_error(test_cb[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(test_cb.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(test_cb[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -1782,27 +1794,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE type 2 hybrid model: 1.023 [rho: 0.424]\n",
- "RMSE type 2 content-based model: 0.977 [rho: 0.484]\n"
+ "RMSE type 2 hybrid model: 1.025 [rho: 0.424]\n",
+ "RMSE type 2 content-based model: 0.977 [rho: 0.486]\n"
]
}
],
"source": [
- "pred_hybrid = m_collective.predict_new(ratings_test2.UserId,\n",
- " item_attr_test.loc[ratings_test2.ItemId])\n",
+ "pred_hybrid = m_collective.predict_new(ratings_test2[\"UserId\"],\n",
+ " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n",
"print(\"RMSE type 2 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test2.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test2[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
- "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2.UserId],\n",
- " item_attr_test.loc[ratings_test2.ItemId])\n",
+ "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2[\"UserId\"]],\n",
+ " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n",
"print(\"RMSE type 2 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test2.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(ratings_test2[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -1827,21 +1839,21 @@
}
],
"source": [
- "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3.ItemId,\n",
- " U=user_attr_test.loc[ratings_test3.UserId])\n",
+ "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n",
+ " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n",
"print(\"RMSE type 3 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test3.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test3[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
- "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3.UserId],\n",
- " item_attr_test.loc[ratings_test3.ItemId])\n",
+ "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3[\"UserId\"]],\n",
+ " item_attr_test.loc[ratings_test3[\"ItemId\"]])\n",
"print(\"RMSE type 3 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test3.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(ratings_test3[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -1860,18 +1872,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE type 4 content-based model: 0.986 [rho: 0.462]\n"
+ "RMSE type 4 content-based model: 0.986 [rho: 0.464]\n"
]
}
],
"source": [
- "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test4.UserId],\n",
- " item_attr_test.loc[ratings_test4.ItemId])\n",
+ "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test4[\"UserId\"]],\n",
+ " item_attr_test.loc[ratings_test4[\"ItemId\"]])\n",
"print(\"RMSE type 4 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test4.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test4[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test4.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(ratings_test4[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -1913,10 +1925,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE type 1 ratings-only model: 0.897 [rho: 0.603]\n",
- "RMSE type 1 ratings + implicit + dyn + Chol: 0.853 [rho: 0.647]\n",
- "RMSE type 1 hybrid model: 0.860 [rho: 0.641]\n",
- "RMSE type 1 hybrid + implicit + dyn + Chol: 0.847 [rho: 0.653]\n"
+ "RMSE type 1 ratings-only model: 0.896 [rho: 0.603]\n",
+ "RMSE type 1 ratings + implicit + dyn + Chol: 0.853 [rho: 0.646]\n",
+ "RMSE type 1 hybrid model: 0.861 [rho: 0.640]\n",
+ "RMSE type 1 hybrid + implicit + dyn + Chol: 0.846 [rho: 0.654]\n"
]
}
],
@@ -1933,35 +1945,35 @@
" U=user_attr_train,\n",
" I=item_attr_train)\n",
"\n",
- "pred_ratingsonly = m_classic.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_ratingsonly = m_classic.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 ratings-only model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_ratingsonly,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_ratingsonly)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_ratingsonly)[0,1]))\n",
"\n",
- "pred_implicit = m_implicit.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_implicit = m_implicit.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 ratings + implicit + dyn + Chol: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_implicit,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_implicit)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_implicit)[0,1]))\n",
"\n",
- "pred_hybrid = m_collective.predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ "pred_hybrid = m_collective.predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
"\n",
"pred_implicit_plus_collective = m_implicit_plus_collective.\\\n",
- " predict(ratings_test1.UserId, ratings_test1.ItemId)\n",
+ " predict(ratings_test1[\"UserId\"], ratings_test1[\"ItemId\"])\n",
"print(\"RMSE type 1 hybrid + implicit + dyn + Chol: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test1.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test1[\"Rating\"],\n",
" pred_implicit_plus_collective,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test1.Rating, pred_implicit_plus_collective)[0,1]))"
+ " np.corrcoef(ratings_test1[\"Rating\"], pred_implicit_plus_collective)[0,1]))"
]
},
{
@@ -1980,38 +1992,38 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE type 2 hybrid model: 1.023 [rho: 0.424]\n",
- "RMSE type 2 hybrid model + implicit + dyn + Chol: 0.999 [rho: 0.490] (might get worse)\n",
- "RMSE type 2 content-based model: 0.977 [rho: 0.484]\n"
+ "RMSE type 2 hybrid model: 1.025 [rho: 0.424]\n",
+ "RMSE type 2 hybrid model + implicit + dyn + Chol: 1.004 [rho: 0.480] (might get worse)\n",
+ "RMSE type 2 content-based model: 0.977 [rho: 0.486]\n"
]
}
],
"source": [
- "pred_hybrid = m_collective.predict_new(ratings_test2.UserId,\n",
- " item_attr_test.loc[ratings_test2.ItemId])\n",
+ "pred_hybrid = m_collective.predict_new(ratings_test2[\"UserId\"],\n",
+ " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n",
"print(\"RMSE type 2 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test2.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test2[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
"pred_implicit_plus_collective = \\\n",
" m_implicit_plus_collective\\\n",
- " .predict_new(ratings_test2.UserId,\n",
- " item_attr_test.loc[ratings_test2.ItemId])\n",
+ " .predict_new(ratings_test2[\"UserId\"],\n",
+ " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n",
"print(\"RMSE type 2 hybrid model + implicit + dyn + Chol: %.3f [rho: %.3f] (might get worse)\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n",
" pred_implicit_plus_collective,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test2.Rating, pred_implicit_plus_collective)[0,1]))\n",
+ " np.corrcoef(ratings_test2[\"Rating\"], pred_implicit_plus_collective)[0,1]))\n",
"\n",
- "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2.UserId],\n",
- " item_attr_test.loc[ratings_test2.ItemId])\n",
+ "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test2[\"UserId\"]],\n",
+ " item_attr_test.loc[ratings_test2[\"ItemId\"]])\n",
"print(\"RMSE type 2 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test2.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test2[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test2.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(ratings_test2[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -2024,38 +2036,38 @@
"output_type": "stream",
"text": [
"RMSE type 3 hybrid model: 0.988 [rho: 0.470]\n",
- "RMSE type 3 hybrid model + implicit + dyn + Chol: 1.014 [rho: 0.457] (got worse)\n",
+ "RMSE type 3 hybrid model + implicit + dyn + Chol: 1.013 [rho: 0.458] (got worse)\n",
"RMSE type 3 content-based model: 0.981 [rho: 0.468]\n"
]
}
],
"source": [
- "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3.ItemId,\n",
- " U=user_attr_test.loc[ratings_test3.UserId])\n",
+ "pred_hybrid = m_collective.predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n",
+ " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n",
"print(\"RMSE type 3 hybrid model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n",
" pred_hybrid,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test3.Rating, pred_hybrid)[0,1]))\n",
+ " np.corrcoef(ratings_test3[\"Rating\"], pred_hybrid)[0,1]))\n",
"\n",
"\n",
"pred_implicit_plus_collective = \\\n",
" m_implicit_plus_collective\\\n",
- " .predict_cold_multiple(item=ratings_test3.ItemId,\n",
- " U=user_attr_test.loc[ratings_test3.UserId])\n",
+ " .predict_cold_multiple(item=ratings_test3[\"ItemId\"],\n",
+ " U=user_attr_test.loc[ratings_test3[\"UserId\"]])\n",
"print(\"RMSE type 3 hybrid model + implicit + dyn + Chol: %.3f [rho: %.3f] (got worse)\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n",
" pred_implicit_plus_collective,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test3.Rating, pred_implicit_plus_collective)[0,1]))\n",
+ " np.corrcoef(ratings_test3[\"Rating\"], pred_implicit_plus_collective)[0,1]))\n",
"\n",
- "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3.UserId],\n",
- " item_attr_test.loc[ratings_test3.ItemId])\n",
+ "pred_contentbased = m_contentbased.predict_new(user_attr_test.loc[ratings_test3[\"UserId\"]],\n",
+ " item_attr_test.loc[ratings_test3[\"ItemId\"]])\n",
"print(\"RMSE type 3 content-based model: %.3f [rho: %.3f]\" %\n",
- " (np.sqrt(mean_squared_error(ratings_test3.Rating,\n",
+ " (np.sqrt(mean_squared_error(ratings_test3[\"Rating\"],\n",
" pred_contentbased,\n",
" squared=True)),\n",
- " np.corrcoef(ratings_test3.Rating, pred_contentbased)[0,1]))"
+ " np.corrcoef(ratings_test3[\"Rating\"], pred_contentbased)[0,1]))"
]
},
{
@@ -2075,7 +2087,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -2089,7 +2101,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.7"
+ "version": "3.11.0"
}
},
"nbformat": 4,
diff --git a/example/load_data.ipynb b/example/load_data.ipynb
index 017b83a..4decacf 100644
--- a/example/load_data.ipynb
+++ b/example/load_data.ipynb
@@ -98,8 +98,11 @@
"source": [
"import numpy as np, pandas as pd, re\n",
"\n",
- "ratings = pd.read_table('ml-1m/ratings.dat', sep='::',\n",
- " engine='python', names=['UserId','ItemId','Rating','Timestamp'])\n",
+ "ratings = pd.read_table(\n",
+ " 'ml-1m/ratings.dat',\n",
+ " sep='::', engine='python',\n",
+ " names=['UserId','ItemId','Rating','Timestamp']\n",
+ ")\n",
"ratings = ratings.drop(\"Timestamp\", axis=1)\n",
"ratings.head()"
]
@@ -120,9 +123,9 @@
}
],
"source": [
- "print(\"Number of users: %d\" % ratings.UserId.drop_duplicates().count())\n",
- "print(\"Number of items: %d\" % ratings.ItemId.drop_duplicates().count())\n",
- "print(\"Number of ratings: %d\" % ratings.Rating.count())"
+ "print(\"Number of users: %d\" % ratings[\"UserId\"].nunique())\n",
+ "print(\"Number of items: %d\" % ratings[\"ItemId\"].nunique())\n",
+ "print(\"Number of ratings: %d\" % ratings[\"Rating\"].count())"
]
},
{
@@ -207,9 +210,11 @@
}
],
"source": [
- "movie_titles = pd.read_table('ml-1m/movies.dat',\n",
- " sep='::', engine='python', header=None)\n",
- "movie_titles.columns = ['ItemId', 'title', 'genres']\n",
+ "movie_titles = pd.read_table(\n",
+ " 'ml-1m/movies.dat',\n",
+ " sep='::', engine='python', header=None, encoding='latin_1',\n",
+ " names=['ItemId', 'title', 'genres']\n",
+ ")\n",
"movie_titles = movie_titles[['ItemId', 'title']]\n",
"\n",
"movie_titles.head()"
@@ -221,7 +226,7 @@
"metadata": {},
"outputs": [],
"source": [
- "movie_id_to_title = {i.ItemId:i.title for i in movie_titles.itertuples()}"
+ "movie_id_to_title = {i.ItemId: i.title for i in movie_titles.itertuples()}"
]
},
{
@@ -444,7 +449,7 @@
"\n",
"tags = pd.read_csv('ml-25m/genome-scores.csv')\n",
"tags_wide = tags.pivot(index='movieId', columns='tagId', values='relevance')\n",
- "tags_wide.columns=[\"tag\"+str(i) for i in tags_wide.columns.values]\n",
+ "tags_wide.columns=[\"tag\"+str(i) for i in tags_wide.columns]\n",
"\n",
"item_side_info = pd.merge(movies, tags_wide, how='inner', left_on='movieId', right_index=True)\n",
"item_side_info = item_side_info.drop('movieId', axis=1)\n",
@@ -485,7 +490,6 @@
" \n",
" | \n",
" ItemId | \n",
- " pc0 | \n",
" pc1 | \n",
" pc2 | \n",
" pc3 | \n",
@@ -494,8 +498,8 @@
" pc6 | \n",
" pc7 | \n",
" pc8 | \n",
+ " pc9 | \n",
" ... | \n",
- " pc40 | \n",
" pc41 | \n",
" pc42 | \n",
" pc43 | \n",
@@ -505,128 +509,129 @@
" pc47 | \n",
" pc48 | \n",
" pc49 | \n",
+ " pc50 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
- " 1.192433 | \n",
- " 2.034965 | \n",
- " 2.679781 | \n",
- " 1.154823 | \n",
- " 0.715302 | \n",
- " 0.982528 | \n",
- " 1.251208 | \n",
- " -0.792800 | \n",
- " 1.605826 | \n",
+ " 1.193171 | \n",
+ " 2.085621 | \n",
+ " 2.634135 | \n",
+ " 1.156088 | \n",
+ " 0.721649 | \n",
+ " 0.995436 | \n",
+ " 1.250474 | \n",
+ " -0.779532 | \n",
+ " 1.616702 | \n",
" ... | \n",
- " -0.322325 | \n",
- " -0.082968 | \n",
- " -0.031470 | \n",
- " -0.220287 | \n",
- " 0.207028 | \n",
- " 0.044198 | \n",
- " 0.273854 | \n",
- " -0.209990 | \n",
- " 0.035795 | \n",
- " -0.159606 | \n",
+ " -0.317134 | \n",
+ " -0.070338 | \n",
+ " -0.019553 | \n",
+ " 0.169051 | \n",
+ " 0.201415 | \n",
+ " -0.094831 | \n",
+ " -0.250461 | \n",
+ " -0.149919 | \n",
+ " -0.031735 | \n",
+ " -0.177708 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
- " -1.333200 | \n",
- " 1.719346 | \n",
- " 1.383137 | \n",
- " 0.788332 | \n",
- " -0.487431 | \n",
- " 0.376546 | \n",
- " 0.803104 | \n",
- " -0.606602 | \n",
- " 0.914494 | \n",
+ " -1.333533 | \n",
+ " 1.743796 | \n",
+ " 1.352161 | \n",
+ " 0.795724 | \n",
+ " -0.484175 | \n",
+ " 0.380645 | \n",
+ " 0.804462 | \n",
+ " -0.598527 | \n",
+ " 0.917250 | \n",
" ... | \n",
- " 0.278489 | \n",
- " -0.293607 | \n",
- " 0.028680 | \n",
- " -0.030128 | \n",
- " 0.311445 | \n",
- " 0.353925 | \n",
- " -0.318455 | \n",
- " 0.098478 | \n",
- " -0.078716 | \n",
- " 0.049872 | \n",
+ " 0.300060 | \n",
+ " -0.261956 | \n",
+ " 0.054457 | \n",
+ " 0.003863 | \n",
+ " 0.304605 | \n",
+ " -0.315796 | \n",
+ " 0.360203 | \n",
+ " 0.152770 | \n",
+ " 0.144790 | \n",
+ " -0.096549 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
- " -1.363421 | \n",
- " -0.034093 | \n",
- " 0.528633 | \n",
- " -0.312122 | \n",
- " 0.468820 | \n",
- " 0.164593 | \n",
- " 0.021909 | \n",
- " 0.161554 | \n",
- " -0.231992 | \n",
+ " -1.363395 | \n",
+ " -0.017107 | \n",
+ " 0.530395 | \n",
+ " -0.316202 | \n",
+ " 0.469430 | \n",
+ " 0.164630 | \n",
+ " 0.019083 | \n",
+ " 0.159188 | \n",
+ " -0.232969 | \n",
" ... | \n",
- " 0.217242 | \n",
- " -0.103778 | \n",
- " -0.290084 | \n",
- " -0.033624 | \n",
- " 0.076278 | \n",
- " 0.224247 | \n",
- " 0.159550 | \n",
- " -0.091091 | \n",
- " -0.134674 | \n",
- " -0.193942 | \n",
+ " 0.215020 | \n",
+ " -0.060682 | \n",
+ " -0.280852 | \n",
+ " 0.001087 | \n",
+ " 0.084960 | \n",
+ " -0.257190 | \n",
+ " -0.136963 | \n",
+ " -0.113914 | \n",
+ " 0.128352 | \n",
+ " -0.203658 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
- " -1.238094 | \n",
- " -1.014399 | \n",
- " 0.790394 | \n",
- " -0.296004 | \n",
- " -0.095043 | \n",
- " -0.052266 | \n",
- " -0.180244 | \n",
- " -0.768811 | \n",
- " -0.400559 | \n",
+ " -1.237840 | \n",
+ " -0.993731 | \n",
+ " 0.809815 | \n",
+ " -0.303009 | \n",
+ " -0.088991 | \n",
+ " -0.049621 | \n",
+ " -0.179544 | \n",
+ " -0.771278 | \n",
+ " -0.400499 | \n",
" ... | \n",
- " 0.073494 | \n",
- " 0.037196 | \n",
- " -0.225767 | \n",
- " -0.398071 | \n",
- " 0.275756 | \n",
- " -0.335302 | \n",
- " 0.254760 | \n",
- " -0.136116 | \n",
- " -0.462383 | \n",
- " 0.485561 | \n",
+ " 0.066207 | \n",
+ " 0.056054 | \n",
+ " -0.223027 | \n",
+ " 0.400157 | \n",
+ " 0.292300 | \n",
+ " 0.260936 | \n",
+ " -0.307608 | \n",
+ " -0.224141 | \n",
+ " 0.488955 | \n",
+ " 0.439189 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
- " -1.613220 | \n",
- " -0.280142 | \n",
- " 1.119149 | \n",
- " -0.130238 | \n",
- " 0.397091 | \n",
- " 0.187158 | \n",
- " 0.108864 | \n",
- " -0.273748 | \n",
- " -0.260166 | \n",
+ " -1.611499 | \n",
+ " -0.251899 | \n",
+ " 1.126443 | \n",
+ " -0.135702 | \n",
+ " 0.403340 | \n",
+ " 0.187289 | \n",
+ " 0.108451 | \n",
+ " -0.275341 | \n",
+ " -0.261142 | \n",
" ... | \n",
- " 0.113957 | \n",
- " -0.123240 | \n",
- " -0.243951 | \n",
- " -0.489377 | \n",
- " -0.024730 | \n",
- " 0.095848 | \n",
- " 0.227061 | \n",
- " -0.296050 | \n",
- " -0.030863 | \n",
- " -0.072919 | \n",
+ " 0.109560 | \n",
+ " -0.086042 | \n",
+ " -0.236327 | \n",
+ " 0.461589 | \n",
+ " 0.013350 | \n",
+ " -0.192557 | \n",
+ " -0.234025 | \n",
+ " -0.369643 | \n",
+ " -0.041060 | \n",
+ " -0.074656 | \n",
"
\n",
" \n",
"\n",
@@ -634,26 +639,26 @@
""
],
"text/plain": [
- " ItemId pc0 pc1 pc2 pc3 pc4 pc5 \\\n",
- "0 1 1.192433 2.034965 2.679781 1.154823 0.715302 0.982528 \n",
- "1 2 -1.333200 1.719346 1.383137 0.788332 -0.487431 0.376546 \n",
- "2 3 -1.363421 -0.034093 0.528633 -0.312122 0.468820 0.164593 \n",
- "3 4 -1.238094 -1.014399 0.790394 -0.296004 -0.095043 -0.052266 \n",
- "4 5 -1.613220 -0.280142 1.119149 -0.130238 0.397091 0.187158 \n",
+ " ItemId pc1 pc2 pc3 pc4 pc5 pc6 \\\n",
+ "0 1 1.193171 2.085621 2.634135 1.156088 0.721649 0.995436 \n",
+ "1 2 -1.333533 1.743796 1.352161 0.795724 -0.484175 0.380645 \n",
+ "2 3 -1.363395 -0.017107 0.530395 -0.316202 0.469430 0.164630 \n",
+ "3 4 -1.237840 -0.993731 0.809815 -0.303009 -0.088991 -0.049621 \n",
+ "4 5 -1.611499 -0.251899 1.126443 -0.135702 0.403340 0.187289 \n",
"\n",
- " pc6 pc7 pc8 ... pc40 pc41 pc42 pc43 \\\n",
- "0 1.251208 -0.792800 1.605826 ... -0.322325 -0.082968 -0.031470 -0.220287 \n",
- "1 0.803104 -0.606602 0.914494 ... 0.278489 -0.293607 0.028680 -0.030128 \n",
- "2 0.021909 0.161554 -0.231992 ... 0.217242 -0.103778 -0.290084 -0.033624 \n",
- "3 -0.180244 -0.768811 -0.400559 ... 0.073494 0.037196 -0.225767 -0.398071 \n",
- "4 0.108864 -0.273748 -0.260166 ... 0.113957 -0.123240 -0.243951 -0.489377 \n",
+ " pc7 pc8 pc9 ... pc41 pc42 pc43 pc44 \\\n",
+ "0 1.250474 -0.779532 1.616702 ... -0.317134 -0.070338 -0.019553 0.169051 \n",
+ "1 0.804462 -0.598527 0.917250 ... 0.300060 -0.261956 0.054457 0.003863 \n",
+ "2 0.019083 0.159188 -0.232969 ... 0.215020 -0.060682 -0.280852 0.001087 \n",
+ "3 -0.179544 -0.771278 -0.400499 ... 0.066207 0.056054 -0.223027 0.400157 \n",
+ "4 0.108451 -0.275341 -0.261142 ... 0.109560 -0.086042 -0.236327 0.461589 \n",
"\n",
- " pc44 pc45 pc46 pc47 pc48 pc49 \n",
- "0 0.207028 0.044198 0.273854 -0.209990 0.035795 -0.159606 \n",
- "1 0.311445 0.353925 -0.318455 0.098478 -0.078716 0.049872 \n",
- "2 0.076278 0.224247 0.159550 -0.091091 -0.134674 -0.193942 \n",
- "3 0.275756 -0.335302 0.254760 -0.136116 -0.462383 0.485561 \n",
- "4 -0.024730 0.095848 0.227061 -0.296050 -0.030863 -0.072919 \n",
+ " pc45 pc46 pc47 pc48 pc49 pc50 \n",
+ "0 0.201415 -0.094831 -0.250461 -0.149919 -0.031735 -0.177708 \n",
+ "1 0.304605 -0.315796 0.360203 0.152770 0.144790 -0.096549 \n",
+ "2 0.084960 -0.257190 -0.136963 -0.113914 0.128352 -0.203658 \n",
+ "3 0.292300 0.260936 -0.307608 -0.224141 0.488955 0.439189 \n",
+ "4 0.013350 -0.192557 -0.234025 -0.369643 -0.041060 -0.074656 \n",
"\n",
"[5 rows x 51 columns]"
]
@@ -670,10 +675,12 @@
"item_sideinfo_reduced = item_side_info.drop(\"ItemId\", axis=1)\n",
"item_sideinfo_pca = pca_obj.fit_transform(item_sideinfo_reduced)\n",
"\n",
- "item_sideinfo_pca = pd.DataFrame(item_sideinfo_pca)\n",
- "item_sideinfo_pca.columns = [\"pc\"+str(i) for i in range(item_sideinfo_pca.shape[1])]\n",
- "item_sideinfo_pca['ItemId'] = item_side_info.ItemId.values.copy()\n",
- "item_sideinfo_pca = item_sideinfo_pca[[\"ItemId\"] + [cl for cl in item_sideinfo_pca.columns if cl != \"ItemId\"]]\n",
+ "item_sideinfo_pca = pd.DataFrame(\n",
+ " item_sideinfo_pca,\n",
+ " columns=[\"pc\"+str(i+1) for i in range(item_sideinfo_pca.shape[1])]\n",
+ ")\n",
+ "item_sideinfo_pca['ItemId'] = item_side_info[\"ItemId\"].to_numpy()\n",
+ "item_sideinfo_pca = item_sideinfo_pca[[\"ItemId\"] + item_sideinfo_pca.columns[:50].tolist()]\n",
"item_sideinfo_pca.head()"
]
},
@@ -686,13 +693,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Number of items from MovieLens 1M with side info: 3065\n"
+ "Number of items from MovieLens 1M with side info: 3080\n"
]
}
],
"source": [
"print(\"Number of items from MovieLens 1M with side info: %d\" %\n",
- " ratings.ItemId[np.in1d(ratings.ItemId, item_sideinfo_pca.ItemId)].drop_duplicates().count())"
+ " ratings[\"ItemId\"][np.in1d(ratings[\"ItemId\"], item_sideinfo_pca[\"ItemId\"])].nunique())"
]
},
{
@@ -709,7 +716,7 @@
"outputs": [],
"source": [
"zipcode_abbs = pd.read_csv(\"states.csv\", low_memory=False)\n",
- "zipcode_abbs_dct = {z.State:z.Abbreviation for z in zipcode_abbs.itertuples()}\n",
+ "zipcode_abbs_dct = {z.State: z.Abbreviation for z in zipcode_abbs.itertuples()}\n",
"us_regs_table = [\n",
" ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),\n",
" ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),\n",
@@ -809,12 +816,14 @@
"source": [
"zipcode_info = pd.read_csv(\"free-zipcode-database.csv\", low_memory=False)\n",
"zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()\n",
- "zipcode_info.loc[zipcode_info.Country != \"US\", 'State'] = 'UnknownOrNonUS'\n",
+ "zipcode_info.loc[lambda x: x[\"Country\"] != \"US\", 'State'] = 'UnknownOrNonUS'\n",
"zipcode_info['Region'] = zipcode_info['State'].copy()\n",
- "zipcode_info.loc[zipcode_info.Country == \"US\", 'Region'] = \\\n",
- " zipcode_info.Region\\\n",
- " .loc[zipcode_info.Country == \"US\"]\\\n",
- " .map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')\n",
+ "zipcode_info.loc[lambda x: x[\"Country\"] == \"US\", \"Region\"] = (\n",
+ " zipcode_info\n",
+ " .loc[lambda x: x[\"Country\"] == \"US\"]\n",
+ " [\"Region\"]\n",
+ " .map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')\n",
+ ")\n",
"zipcode_info = zipcode_info[['Zipcode', 'Region']]\n",
"zipcode_info.head()"
]
@@ -925,11 +934,14 @@
}
],
"source": [
- "users = pd.read_table('ml-1m/users.dat',\n",
- " sep='::', names=[\"UserId\", \"Gender\", \"Age\", \"Occupation\", \"Zipcode\"], engine='python')\n",
- "users[\"Zipcode\"] = users.Zipcode.map(lambda x: np.int(re.sub(\"-.*\",\"\",x)))\n",
- "users = pd.merge(users,zipcode_info,on='Zipcode',how='left')\n",
- "users['Region'] = users.Region.fillna('UnknownOrNonUS')\n",
+ "users = pd.read_table(\n",
+ " 'ml-1m/users.dat',\n",
+ " sep='::', engine='python', encoding='cp1252',\n",
+ " names=[\"UserId\", \"Gender\", \"Age\", \"Occupation\", \"Zipcode\"]\n",
+ ")\n",
+ "users[\"Zipcode\"] = users[\"Zipcode\"].map(lambda x: int(re.sub(\"-.*\", \"\", x)))\n",
+ "users = pd.merge(users, zipcode_info, on='Zipcode', how='left')\n",
+ "users['Region'] = users[\"Region\"].fillna('UnknownOrNonUS')\n",
"\n",
"occupations = {\n",
" 0: \"\\\"other\\\" or not specified\",\n",
@@ -954,8 +966,8 @@
" 19: \"unemployed\",\n",
" 20: \"writer\"\n",
"}\n",
- "users['Occupation'] = users.Occupation.map(occupations)\n",
- "users['Age'] = users.Age.map(lambda x: str(x))\n",
+ "users['Occupation'] = users[\"Occupation\"].map(occupations)\n",
+ "users['Age'] = users[\"Age\"].map(lambda x: str(x))\n",
"users.head()"
]
},
@@ -1012,122 +1024,122 @@
" \n",
" 0 | \n",
" 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" ... | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
"
\n",
" \n",
"\n",
@@ -1136,32 +1148,32 @@
],
"text/plain": [
" UserId Gender_F Gender_M Age_1 Age_18 Age_25 Age_35 Age_45 Age_50 \\\n",
- "0 1 1 0 1 0 0 0 0 0 \n",
- "1 2 0 1 0 0 0 0 0 0 \n",
- "2 3 0 1 0 0 1 0 0 0 \n",
- "3 4 0 1 0 0 0 0 1 0 \n",
- "4 5 0 1 0 0 1 0 0 0 \n",
+ "0 1 True False True False False False False False \n",
+ "1 2 False True False False False False False False \n",
+ "2 3 False True False False True False False False \n",
+ "3 4 False True False False False False True False \n",
+ "4 5 False True False False True False False False \n",
"\n",
" Age_56 ... Occupation_unemployed Occupation_writer \\\n",
- "0 0 ... 0 0 \n",
- "1 1 ... 0 0 \n",
- "2 0 ... 0 0 \n",
- "3 0 ... 0 0 \n",
- "4 0 ... 0 1 \n",
+ "0 False ... False False \n",
+ "1 True ... False False \n",
+ "2 False ... False False \n",
+ "3 False ... False False \n",
+ "4 False ... False True \n",
"\n",
" Region_Middle Atlantic Region_Midwest Region_New England Region_South \\\n",
- "0 0 1 0 0 \n",
- "1 0 0 0 1 \n",
- "2 0 1 0 0 \n",
- "3 0 0 1 0 \n",
- "4 0 1 0 0 \n",
+ "0 False True False False \n",
+ "1 False False False True \n",
+ "2 False True False False \n",
+ "3 False False True False \n",
+ "4 False True False False \n",
"\n",
" Region_Southwest Region_UnknownOrNonUS Region_UsOther Region_West \n",
- "0 0 0 0 0 \n",
- "1 0 0 0 0 \n",
- "2 0 0 0 0 \n",
- "3 0 0 0 0 \n",
- "4 0 0 0 0 \n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False False \n",
+ "3 False False False False \n",
+ "4 False False False False \n",
"\n",
"[5 rows x 39 columns]"
]
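The wall of expected-output changes above has a single cause: assuming these indicator columns come from `pd.get_dummies` (the encoding cell is outside this diff), pandas 2.0 changed its default output dtype from `uint8` to `bool`, so the cells now render as True/False. The `dtype` argument restores numeric indicators when a model needs them:

    import pandas as pd

    s = pd.Series(["F", "M", "M"])
    print(pd.get_dummies(s).dtypes.unique())                 # [bool] on pandas >= 2.0
    print(pd.get_dummies(s, dtype="uint8").dtypes.unique())  # [uint8] on any recent version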
@@ -1191,7 +1203,7 @@
],
"source": [
"print(\"Number of users with demographic information: %d\" %\n",
- " user_side_info.UserId.drop_duplicates().count())"
+ " user_side_info[\"UserId\"].nunique())"
]
},
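`nunique()` is the idiomatic replacement here: it counts distinct non-null values directly, where the old spelling first materialized an intermediate de-duplicated Series. The two agree on null handling:

    import pandas as pd

    s = pd.Series([1, 1, 2, None])
    assert s.nunique() == 2                   # NaN excluded by default
    assert s.drop_duplicates().count() == 2   # count() also skips NaN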
{
@@ -1232,7 +1244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.5"
+ "version": "3.11.0"
}
},
"nbformat": 4,
diff --git a/requirements.txt b/requirements.txt
index 9295cf0..4206eea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
cython
-numpy>=1.18.1
+numpy>=1.25
scipy
-pandas>=0.25.0
+pandas
threadpoolctl
diff --git a/setup.py b/setup.py
index 3c0d46e..9b1981c 100644
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,7 @@ def set_omp_false():
if len(custom_blas_link_args) or len(custom_blas_compile_args):
build_ext_with_blas = build_ext
+use_findblas = False
if not (len(custom_blas_link_args) or len(custom_blas_compile_args)):
use_findblas = (("findblas" in sys.argv)
or ("-findblas" in sys.argv)
@@ -45,6 +46,18 @@ def set_omp_false():
except AttributeError:
EXIT_SUCCESS = 0
+## For debugging
+if "--asan" in sys.argv:
+ ADD_ASAN = True
+ sys.argv.remove("--asan")
+else:
+ ADD_ASAN = False
+if "--ggdb" in sys.argv:
+ ADD_GGDB = True
+ sys.argv.remove("--ggdb")
+else:
+ ADD_GGDB = False
+
class build_ext_subclass( build_ext_with_blas ):
def build_extensions(self):
is_windows = sys.platform[:3].lower() == "win"
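Because the new `--asan` and `--ggdb` switches are consumed (and removed from `sys.argv`) before setuptools parses the command line, they ride along on a normal build invocation, e.g. `python setup.py build_ext --inplace --asan` for an AddressSanitizer-instrumented build, or `python setup.py build_ext --inplace --ggdb` for debug symbols plus the `DEBUG` macro. The `build_ext --inplace` part is an assumption about how the package is built; any command that reaches this setup.py would work the same way.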
@@ -69,16 +82,25 @@ def build_extensions(self):
if is_windows:
e.define_macros += [("NO_LONG_DOUBLE", None)]
-
- # e.extra_compile_args += ['-O2', '-fopenmp', '-march=native', '-std=c99', '-ggdb']
- # e.extra_link_args += ['-fopenmp']
- # e.extra_link_args += ['-fopenmp=libomp']
- # e.extra_compile_args += ['-O2', '-march=native', '-std=c99', '-ggdb']
-
+ if ADD_ASAN:
+ for e in self.extensions:
+ if self.compiler.compiler_type != "clang":
+ e.extra_compile_args += ["-fsanitize=address", "-static-libasan", "-ggdb"]
+ else:
+ e.extra_compile_args += ["-fsanitize=address", "-static-libsan", "-ggdb"]
+
+ elif ADD_GGDB:
+ for e in self.extensions:
+ e.extra_compile_args += ["-ggdb"]
+ e.define_macros += [("DEBUG", 1)]
- # e.extra_compile_args += ['-fsanitize=address', '-static-libasan', '-ggdb']
- # e.extra_link_args += ['-fsanitize=address', '-static-libasan']
+ if self.compiler.compiler_type == "clang":
+ e.extra_compile_args += [
+ "-Wno-unknown-pragmas",
+ "-Wno-unknown-attributes",
+ "-Wno-pass-failed",
+ ]
## If a custom BLAS/LAPACK is provided:
if len(custom_blas_link_args) or len(custom_blas_compile_args):
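One caveat on the hunk above, offered as an observation rather than as part of the patch: `-fsanitize=address` is added only to `extra_compile_args`, while the commented-out recipe it replaces also passed it at link time, which GCC and Clang generally require so the sanitizer runtime gets linked in. A hypothetical standalone illustration (the module and source names are illustrative):

    from setuptools import Extension

    ext = Extension("cmfrec.wrapper_double", sources=["cmfrec/wrapper_double.pyx"])
    ext.extra_compile_args += ["-fsanitize=address", "-ggdb"]
    ext.extra_link_args += ["-fsanitize=address"]   # sanitizer runtime is pulled in at link time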
@@ -348,7 +370,7 @@ def test_supports_clang_reassociate(self):
setup(
name = "cmfrec",
packages = ["cmfrec"],
- version = '3.5.1-7',
+ version = '3.5.1-8',
description = 'Collective matrix factorization',
author = 'David Cortes',
url = 'https://github.com/david-cortes/cmfrec',
@@ -356,9 +378,9 @@ def test_supports_clang_reassociate(self):
'relational learning'],
install_requires=[
'cython',
- 'numpy>=1.17',
+ 'numpy>=1.25',
'scipy',
- 'pandas>=0.25.0',
+ 'pandas',
'findblas'
],
cmdclass = {'build_ext': build_ext_subclass},