diff --git a/NAMESPACE b/NAMESPACE index 59c51e6d..58e77579 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,12 +16,6 @@ S3method(circosPlot,block.plsda) S3method(circosPlot,block.spls) S3method(circosPlot,block.splsda) S3method(image,tune.rcc) -S3method(perf,assess.mint.plsda) -S3method(perf,assess.mint.splsda) -S3method(perf,assess.mixo_pls) -S3method(perf,assess.mixo_plsda) -S3method(perf,assess.mixo_spls) -S3method(perf,assess.mixo_splsda) S3method(perf,mint.pls) S3method(perf,mint.plsda) S3method(perf,mint.spls) @@ -31,6 +25,12 @@ S3method(perf,mixo_plsda) S3method(perf,mixo_spls) S3method(perf,mixo_splsda) S3method(perf,sgccda) +S3method(perf.assess,mint.plsda) +S3method(perf.assess,mint.splsda) +S3method(perf.assess,mixo_pls) +S3method(perf.assess,mixo_plsda) +S3method(perf.assess,mixo_spls) +S3method(perf.assess,mixo_splsda) S3method(perf.assess,sgccda) S3method(plot,pca) S3method(plot,perf.mint.plsda.mthd) @@ -169,9 +169,13 @@ export(spls) export(splsda) export(study_split) export(tune) +export(tune.block.plsda) export(tune.block.splsda) +export(tune.mint.plsda) export(tune.mint.splsda) export(tune.pca) +export(tune.pls) +export(tune.plsda) export(tune.rcc) export(tune.spca) export(tune.spls) diff --git a/R/image.tune.rcc.R b/R/image.tune.rcc.R index 2585ff56..f462c04d 100644 --- a/R/image.tune.rcc.R +++ b/R/image.tune.rcc.R @@ -24,7 +24,7 @@ #' Y <- nutrimouse$gene #' #' ## this can take some seconds -#' cv.score <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE) +#' cv.score <- tune.rcc(X, Y, validation = "Mfold") #' plot(cv.score) #' #' # image(cv.score) # same result as plot() diff --git a/R/perf.R b/R/perf.R index 5e713cbd..9ef66147 100644 --- a/R/perf.R +++ b/R/perf.R @@ -110,6 +110,7 @@ #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. #' Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation #' @param ... not used +#' #' @return For PLS and sPLS models, \code{perf} produces a list with the #' following components for every repeat: #' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only @@ -141,19 +142,25 @@ #' \item{cor.tpred, cor.upred}{Correlation between the #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -#' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate}{ For -#' PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification +#' predicted and actual components for X (t) and Y (u)} +#' +#' +#' +#' For PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification #' error rate estimation. The dimensions correspond to the components in the #' model and to the prediction method used, respectively. Note that error rates #' reported in any component include the performance of the model in earlier #' components for the specified \code{keepX} parameters (e.g. error rate #' reported for component 3 for \code{keepX = 20} already includes the fitted -#' model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -#' of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -#' consider using the \code{predict} function.} -#' \item{auc}{Averaged AUC values -#' over the \code{nrepeat}} +#' model on components 1 and 2 for \code{keepX = 20}). 
+#' \item{error.rate}{Prediction error rate for each dist and measure}
+#' \item{auc}{AUC values per component averaged over the \code{nrepeat}}
+#' \item{auc.all}{AUC values per component per repeat}
+#' \item{predict}{A list of length ncomp that gives the predicted values of each sample for each class}
+#' \item{features}{a list of features selected across the folds (\code{$stable.X}) for the \code{keepX} parameters from the input object.}
+#' \item{choice.ncomp}{Optimal number of components for the model for each prediction distance using one-sided t-tests that test
+#' for a significant difference in the mean error rate (gain in prediction) when components are added to the model.}
+#' \item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components}
 #'
 #' For mint.splsda models, \code{perf} produces the following outputs:
 #' \item{study.specific.error}{A list that gives BER, overall error rate and
@@ -166,7 +173,7 @@
 #' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint
 #' models}
 #'
-#' For sgccda models, \code{perf} produces the following outputs:
+#' For sgccda models (i.e. block (s)PLS-DA models), \code{perf} produces the following outputs:
 #' \item{error.rate}{Prediction error rate for each block of \code{object$X}
 #' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for
 #' each block of \code{object$X}, each \code{dist} and each class}
@@ -197,12 +204,14 @@
 #' \item{WeightedVote.error.rate}{if more than one block, returns the error
 #' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights
 #' of each block used for the weighted predictions, for each nrepeat and each
-#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number
+#' fold}
+#' \item{choice.ncomp}{For supervised models; returns the optimal number
 #' of components for the model for each prediction distance using one-sided
 #' t-tests that test for a significant difference in the mean error rate (gain
 #' in prediction) when components are added to the model. See more details in
 #' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is
 #' returned for each prediction framework.}
+#'
 #' @author Ignacio González, Amrit Singh, Kim-Anh Lê Cao, Benoit Gautier,
 #' Florian Rohart, Al J Abadi
 #' @seealso \code{\link{predict}}, \code{\link{nipals}},
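# ---------------------------------------------------------------------------
# Usage sketch for perf() and the new perf.assess() generic documented in the
# hunks above and below (illustration only, not part of the patch; assumes the
# srbct data shipped with mixOmics):
#
#   library(mixOmics)
#   data(srbct)
#   model <- splsda(srbct$gene, srbct$class, ncomp = 3, keepX = c(20, 20, 20))
#   # perf() evaluates components 1..ncomp and returns the outputs above
#   res <- perf(model, validation = "Mfold", folds = 5, nrepeat = 3,
#               dist = "max.dist", seed = 12)
#   res$error.rate   # error rate per component and distance
#   res$choice.ncomp # optimal ncomp from the one-sided t-tests
#   # perf.assess() (documented below) reports performance at ncomp only
#   res2 <- perf.assess(model, validation = "Mfold", folds = 5, nrepeat = 3,
#                       dist = "max.dist", seed = 12)
#   res2$error.rate
# ---------------------------------------------------------------------------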
diff --git a/R/perf.assess.R b/R/perf.assess.R
index 60740196..767283fa 100644
--- a/R/perf.assess.R
+++ b/R/perf.assess.R
@@ -34,16 +34,6 @@
 #' that for PLS and sPLS objects, perf is performed on the pre-processed data
 #' after log ratio transform and multilevel analysis, if any.
 #'
-#' Sparse methods. The sPLS, sPLS-DA and sgccda functions are run on several
-#' and different subsets of data (the cross-folds) and will certainly lead to
-#' different subset of selected features. Those are summarised in the output
-#' \code{features$stable} (see output Value below) to assess how often the
-#' variables are selected across all folds. Note that for PLS-DA and sPLS-DA
-#' objects, perf is performed on the original data, i.e. before the
-#' pre-processing step of the log ratio transform and multilevel analysis, if
-#' any. In addition for these methods, the classification error rate is
-#' averaged across all folds.
-#'
 #' The mint.sPLS-DA function estimates errors based on Leave-one-group-out
 #' cross validation (where each levels of object$study is left out (and
 #' predicted) once) and provides study-specific outputs
@@ -63,8 +53,7 @@
 #' threshold based on distances (see \code{predict}) that optimally determine
 #' class membership of the samples tested. As such AUC and ROC are not needed
 #' to estimate the performance of the model. We provide those outputs as
-#' complementary performance measures. See more details in our mixOmics
-#' article.
+#' complementary performance measures.
 #'
 #' Prediction distances. See details from \code{?predict}, and also our
 #' supplemental material in the mixOmics article.
@@ -87,20 +76,20 @@
 #' More details about the PLS modes in \code{?pls}.
 #'
 #' @param object object of class inherited from \code{"pls"}, \code{"plsda"},
-#' \code{"spls"}, \code{"splsda"} or \code{"mint.splsda"}. The function will
+#' \code{"spls"}, \code{"splsda"}, \code{"sgccda"} or \code{"mint.splsda"}. The function will
 #' retrieve some key parameters stored in that object.
+#' @param validation a character string. What kind of (internal) validation to use,
+#' matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
+#' \code{"Mfold"}. For MINT methods only \code{"loo"} will be used.
+#' @param folds numeric. Number of folds in the Mfold cross-validation. See Details.
+#' @param nrepeat numeric. Number of times the Cross-Validation process is repeated.
+#' This is an important argument to ensure the estimation of the performance to
+#' be as accurate as possible. Default is 1.
 #' @param dist only applies to an object inheriting from \code{"plsda"},
 #' \code{"splsda"} or \code{"mint.splsda"} to evaluate the classification
 #' performance of the model. Should be a subset of \code{"max.dist"},
 #' \code{"centroids.dist"}, \code{"mahalanobis.dist"}. Default is \code{"all"}.
 #' See \code{\link{predict}}.
-#' @param validation character. What kind of (internal) validation to use,
-#' matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
-#' \code{"Mfold"}.
-#' @param folds the folds in the Mfold cross-validation. See Details.
-#' @param nrepeat Number of times the Cross-Validation process is repeated.
-#' This is an important argument to ensure the estimation of the performance to
-#' be as accurate as possible.
 #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC)
 #' performance of the model.
 #' @param progressBar by default set to \code{FALSE} to output the progress bar
@@ -113,15 +102,16 @@
 #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.
 #' Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation
 #' @param ... not used
-#' @return For PLS and sPLS models, \code{perf} produces a list with the
-#' following components for every repeat:
+#'
+#' @return For PLS and sPLS models:
 #' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only
 #' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
 #' available when in regression (s)PLS.}
 #' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only
 #' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
 #' available when in regression (s)PLS.}
-#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables.
Only applies to object +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object #' inherited from \code{"pls"}, and \code{"spls"}. Only available when in #' regression (s)PLS.} #' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values @@ -129,41 +119,44 @@ #' Note that in the specific case of an sPLS model, it is better to have a look #' at the Q2.total criterion, only applies to object inherited from #' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} -#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for model, only applies to object inherited from +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from #' \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} -#' \item{RSS}{Residual Sum of Squares across all selected features.} +#' \item{RSS}{Residual Sum of Squares across all selected features} #' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected features} -#' \item{features}{a list of features selected across the -#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and -#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} -#' if using standard (non-sparse) PLS.} #' \item{cor.tpred, cor.upred}{Correlation between the #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -#' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate}{ For -#' PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification -#' error rate estimation using overall and BER error rates across different distance methods. -#' Although error rates are only reported for the number of components used in the final model, -#' Note that are calculated including the performance of the model in a smaller number of -#' components for the specified \code{keepX} parameters (e.g. error rate -#' reported for component 3 for \code{keepX = 20} already includes the fitted -#' model on components 1 and 2 for \code{keepX = 20}). 
For more advanced usage -#' of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -#' consider using the \code{predict} function.} -#' \item{auc}{Averaged AUC values -#' over the \code{nrepeat}} -#' -#' #' For sgccda models, \code{perf} produces the following outputs: +#' predicted and actual components for X (t) and Y (u)} +#' +#' +#' +#' For PLS-DA and sPLS-DA models: +#' \item{error.rate}{Prediction error rate for each dist and measure} +#' \item{auc}{AUC value averaged over the \code{nrepeat}} +#' \item{auc.all}{AUC values per repeat} +#' \item{predict}{Predicted values of each sample for each class} +#' \item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} +#' +#' For mint.splsda models: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} +#' \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} +#' \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist}.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint models} +#' +#' For sgccda models (i.e. block (s)PLS-DA models): #' \item{error.rate}{Prediction error rate for each block of \code{object$X} #' and each \code{dist}} #' \item{error.rate.per.class}{Prediction error rate for #' each block of \code{object$X}, each \code{dist} and each class} -#' \item{predict}{Predicted values of each sample for each class and each block.} -#' \item{class}{Predicted class of each sample for each block, each \code{dist}, and each nrepeat} -#' \item{features}{a list of features selected across the folds (\code{$stable.X} and -#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the -#' input object.} +#' \item{predict}{Predicted values of each sample for each class and each block} +#' \item{class}{Predicted class of each sample for each +#' block, each \code{dist}, and each nrepeat} #' \item{AveragedPredict.class}{if more than one block, returns #' the average predicted class over the blocks (averaged of the \code{Predict} #' output and prediction using the \code{max.dist} distance)} @@ -187,15 +180,6 @@ #' rate of the \code{WeightedVote} output} #' \item{weights}{Returns the weights of each block used for the weighted predictions, for each nrepeat and each #' fold} -#' -#' For mint.splsda models, \code{perf} produces the following outputs: -#' \item{study.specific.error}{A list that gives BER, overall error rate and -#' error rate per class, for each study} -#' \item{global.error}{A list that gives BER, overall error rate and error rate per class for all samples} -#' \item{predict}{A list of the predicted values of each sample for each class} -#' \item{class}{A list which gives the predicted class of each sample for each \code{dist}. Directly obtained from the \code{predict} output.} -#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study} -#' #' @author Ignacio González, Amrit Singh, Kim-Anh Lê Cao, Benoit Gautier, #' Florian Rohart, Al J Abadi @@ -247,7 +231,7 @@ #' 4:e1845. #' @keywords regression multivariate #' @export -#' @example ./examples/perf-examples.R +#' @example ./examples/perf.assess-examples.R ## ------------------------------- Generic -------------------------------- ## perf.assess <- function(object, ...) 
UseMethod("perf.assess") \ No newline at end of file diff --git a/R/perf.assess.diablo.R b/R/perf.assess.diablo.R index 1673d75e..65bc6723 100644 --- a/R/perf.assess.diablo.R +++ b/R/perf.assess.diablo.R @@ -33,7 +33,6 @@ # folds - number of folds if validation = "Mfold" # ---------------------------------------------------------------------------------------------------------- #' @rdname perf.assess -#' @importFrom utils relist #' @method perf.assess sgccda #' @export perf.assess.sgccda <- diff --git a/R/perf.assess.mint.plsda.R b/R/perf.assess.mint.plsda.R index 4863d3e7..a635d502 100644 --- a/R/perf.assess.mint.plsda.R +++ b/R/perf.assess.mint.plsda.R @@ -1,5 +1,41 @@ -## -------------------------- perf.mint(s)plsda --------------------------- ## +############################################################################################################# +# Authors: +# Amrit Singh, University of British Columbia, Vancouver. +# Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# +# created: 01-04-2015 +# last modified: 27-05-2016 +# +# Copyright (C) 2015 +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+#############################################################################################################
+
+
+# ----------------------------------------------------------------------------------------------------------
+# perf.assess.mint.plsda - Function to evaluate the performance of the fitted mint PLS-DA model (cross-validation)
+# inputs: object - object obtained from running mint.plsda
+#         dist - to evaluate the classification performance
+#         validation - type of validation
+#         folds - number of folds if validation = "Mfold"
+# ----------------------------------------------------------------------------------------------------------
+## -------------------------- perf.mint(s)plsda --------------------------- ##
+
 #' @rdname perf.assess
+#' @method perf.assess mint.plsda
 #' @export
 perf.assess.mint.plsda <- function (object,
                                     dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
@@ -259,5 +295,6 @@ perf.assess.mint.plsda <- function (object,
 }
 
 #' @rdname perf.assess
+#' @method perf.assess mint.splsda
 #' @export
 perf.assess.mint.splsda <- perf.assess.mint.plsda
\ No newline at end of file
diff --git a/R/perf.assess.pls.R b/R/perf.assess.pls.R
index a015d924..1fb16737 100644
--- a/R/perf.assess.pls.R
+++ b/R/perf.assess.pls.R
@@ -31,6 +31,7 @@
 ## -------------------------------- (s)PLS -------------------------------- ##
 
 #' @rdname perf.assess
+#' @method perf.assess mixo_pls
 #' @export
 perf.assess.mixo_pls <- function(object,
                                  validation = c("Mfold", "loo"),
@@ -41,6 +42,7 @@ perf.assess.mixo_pls <- function(object,
                                  seed = NULL,
                                  ...)
 {
+    # checking args and initialize params
     ncomp = object$ncomp
     spls.model <- is(object, 'mixo_spls')
@@ -65,7 +67,7 @@ perf.assess.mixo_pls <- function(object,
     measures <- as.data.frame(measures)
 
     # Add this line to remove rows with NAs that correspond to components < ncomp
-    measures <- dplyr::filter(measures, comp == ncomp)
+    measures <- dplyr::filter(measures, .data$comp == ncomp)
 
     ## R CMD check stuff
     measure <- feature <- comp <- block <- stability <- value <- NULL
@@ -132,6 +134,7 @@ perf.assess.mixo_pls <- function(object,
 }
 
 #' @rdname perf.assess
+#' @method perf.assess mixo_spls
 #' @export
 perf.assess.mixo_spls <- perf.assess.mixo_pls
@@ -145,6 +148,10 @@ perf.assess.mixo_spls <- perf.assess.mixo_pls
 {
     # changes to bypass the loop for the Q2
+
+    ## R CMD check stuff
+    measure <- feature <- comp <- block <- stability <- value <- NULL
+    lower <- upper <- keepX <- keepY <- NULL
 
     ## -------- checks -------- ##
     if (object$mode == 'invariant')
diff --git a/R/perf.assess.plsda.R b/R/perf.assess.plsda.R
index 4485e9c1..fa477a2e 100644
--- a/R/perf.assess.plsda.R
+++ b/R/perf.assess.plsda.R
@@ -31,7 +31,7 @@
 ## ------------------------------- (s)PLSDA ------------------------------- ##
 
 #' @rdname perf.assess
-#' @importFrom methods hasArg
+#' @method perf.assess mixo_plsda
 #' @export
 perf.assess.mixo_plsda <- function(object,
                                    dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
@@ -328,5 +328,6 @@ perf.assess.mixo_plsda <- function(object,
 }
 
 #' @rdname perf.assess
+#' @method perf.assess mixo_splsda
 #' @export
 perf.assess.mixo_splsda <- perf.assess.mixo_plsda
diff --git a/R/tune.R b/R/tune.R
index 13bb1dc9..7a100dff 100644
--- a/R/tune.R
+++ b/R/tune.R
@@ -82,8 +82,6 @@
 #' threshold required for improvement in error rate of the components. Default
 #' to 0.01.
#' @param V Matrix used in the logratio transformation id provided (for tune.pca) -#' @param plot logical argument indicating whether an image map should be -#' plotted by calling the \code{imgCV} function. (for tune.rcc) #' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating the type #' of parallelisation. See examples. #' @param seed set a number here if you want the function to give reproducible outputs. @@ -165,202 +163,158 @@ #' @export #' @example ./examples/tune-examples.R tune <- - function (method = c("spls", "splsda", "block.splsda", "mint.splsda", "rcc", "pca", "spca"), + function (method = c("pls", "spls", "plsda", "splsda", "block.plsda", "block.splsda", "mint.plsda", "mint.splsda", "rcc", "pca", "spca"), # Input data X, Y, + # Sparse params test.keepX = c(5, 10, 15), test.keepY = NULL, already.tested.X, already.tested.Y, # Number of components ncomp, - # PLS mode - mode = c("regression", "canonical", "invariant", "classic"), - # How to do the cross-validation + + ## Params related to model building + V, # PCA + center = TRUE, # PCA + grid1 = seq(0.001, 1, length = 5), # CCA + grid2 = seq(0.001, 1, length = 5), # CCA + mode = c("regression", "canonical", "invariant", "classic"), # PLS + indY, # block PLSDA + weighted = TRUE, # block PLSDA + design, # block PLSDA + study, # MINT PLSDA + tol = 1e-09, # PLSDA, block PLSDA, mint PLSDA, PCA + scale = TRUE, # PLS, PLSDA, block PLSDA, mint PLSDA, PCA + logratio = c('none','CLR'), # PLS, PLSDA, PCA + near.zero.var = FALSE, # PLS, PLSDA, block PLSDA, mint PLSDA + max.iter = 100, # PLS, PLSDA, block PLSDA, mint PLSDA, PCA + multilevel = NULL, # PLS, PLSDA, PCA + + ## Params related to cross-validation + validation = "Mfold", nrepeat = 1, folds = 10, - validation = "Mfold", - max.iter = 100, - tol = 1e-09, signif.threshold = 0.01, - # How to transform data - logratio = c('none','CLR'), - V, - center = TRUE, - scale = TRUE, - near.zero.var = FALSE, - # How to measure accuracy + + # How to measure accuracy - only for DA methods dist = "max.dist", measure = ifelse(method == "spls", "cor", "BER"), - # Multilevel - multilevel = NULL, + auc = FALSE, + # Running params seed = NULL, BPPARAM = SerialParam(), progressBar = FALSE, + # Output params - auc = FALSE, - light.output = TRUE, - plot = FALSE, - # Multiblock specific params - indY, - weighted = TRUE, - design, - # CCA specific params - grid1 = seq(0.001, 1, length = 5), - grid2 = seq(0.001, 1, length = 5), - # MINT specific params - study + light.output = TRUE + ) { + + ## ----------- checks ----------- ## + method = match.arg(method) mode <- match.arg(mode) # hardcode scheme and init scheme = "horst" init = "svd.single" - - if (method == "mint.splsda") { - message("Calling 'tune.mint.splsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") - - if (missing(ncomp)) - ncomp = 1 - - result = tune.mint.splsda(X = X, Y = Y, - ncomp = ncomp, - study = study, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - dist = dist, - measure = measure, - auc = auc, - progressBar = progressBar, - scale = scale, - tol = tol, - max.iter = max.iter, - near.zero.var = near.zero.var, - light.output = light.output) - - } else if (method == "block.splsda") { - message("Calling 'tune.block.splsda'") - - result = tune.block.splsda(X = X, - Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - validation = validation, - folds = folds, - dist = dist, - measure = measure, - progressBar = progressBar, - scale = scale, - tol = tol, - max.iter = 
max.iter, - near.zero.var = near.zero.var, - light.output = light.output, - nrepeat = nrepeat, - BPPARAM = BPPARAM, - indY = indY, - weighted = weighted, - design = design, - scheme = scheme, - init = init, - signif.threshold = signif.threshold, - seed = seed - ) - - - } else if (method == "rcc") { - message("Calling 'tune.rcc'") - - result = tune.rcc(X = X, - Y = Y, - grid1 = grid1, - grid2 = grid2, - validation = validation, - folds = folds, - plot = plot, - BPPARAM = BPPARAM, - seed = seed) - - } else if (method == "pca") { + + + ## ----------- PCA ----------- ## + + if (method == "pca") { message("Calling 'tune.pca'") if (missing(ncomp)) ncomp = NULL - result = tune.pca(X = X, - ncomp = ncomp, - center = center, - scale = scale, - max.iter = max.iter, - tol = tol) - - + result = tune.pca( + # model building params + X = X, ncomp = ncomp, + center = center, scale = scale, max.iter = max.iter, tol = tol) + + ## ----------- sPCA ----------- ## + } else if (method == "spca") { message("Calling 'tune.spca'") if (missing(ncomp)) ncomp = 2 #TODO use match.call() arg matching for these function calls - result = tune.spca(X = X, - ncomp = ncomp, - nrepeat = nrepeat, - folds = folds, - test.keepX = test.keepX, - center = center, - scale = scale, - BPPARAM = BPPARAM, - seed = seed) - - - } else if (method == "splsda") { - - message("Calling 'tune.splsda'") - - if (missing(ncomp)) - ncomp = 1 + result = tune.spca( + # model building params + X = X, ncomp = ncomp, + center = center, scale = scale, + # sparsity params + test.keepX = test.keepX, + # CV params + nrepeat = nrepeat, folds = folds, + # running params + BPPARAM = BPPARAM, seed = seed) + + + ## ----------- CCA ----------- ## + + } else if (method == "rcc") { + message("Calling 'tune.rcc'") - result = tune.splsda (X = X, Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - validation = validation, - folds = folds, - dist = dist , - measure = measure, - auc = auc, - progressBar = progressBar, - max.iter = max.iter, - near.zero.var = near.zero.var, - nrepeat = nrepeat, - logratio = logratio, - multilevel = multilevel, - light.output = light.output, - seed = seed) + result = tune.rcc( + # model building params + X = X, Y = Y, + # sparsity params + grid1 = grid1, grid2 = grid2, + # CV params + validation = validation, folds = folds, + # running params + BPPARAM = BPPARAM, seed = seed) + + ## ----------- PLS ----------- ## + + } else if (method == "pls") { + message("Calling 'tune.pls'") + result = tune.pls( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, nrepeat = nrepeat, folds = folds, + # PA params + measure = measure, + # running params + progressBar = progressBar, + BPPARAM = BPPARAM, seed = seed) + + # multilevel?? 
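+    # illustrative dispatch (data names assumed): tune(method = "pls",
+    # X = X, Y = Y, ncomp = 2, validation = "Mfold", folds = 5)
+    # simply forwards to tune.pls() above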
+ + ## ----------- sPLS +/- multilevel ----------- ## + } else if (method == "spls") { if(missing(multilevel)) { message("Calling 'tune.spls'") - result = tune.spls(X = X, - Y = Y, - test.keepX = test.keepX, - test.keepY = test.keepY, - ncomp = ncomp, - validation = validation, - nrepeat = nrepeat, - folds = folds, - mode = mode, - measure = measure, - BPPARAM = BPPARAM, - progressBar = progressBar, - seed = seed - ) - } else { + result = tune.spls( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, test.keepY = test.keepY, + # CV params + validation = validation, nrepeat = nrepeat, folds = folds, + # PA params + measure = measure, + # running params + progressBar = progressBar, + BPPARAM = BPPARAM, seed = seed) + + } else { message("Calling 'tune.splslevel' with method = 'spls'") if (missing(ncomp)) @@ -370,15 +324,144 @@ tune <- if (missing(already.tested.Y)) already.tested.Y = NULL - result = tune.splslevel(X = X, Y = Y, - multilevel = multilevel, - mode = mode, - ncomp = ncomp, test.keepX = test.keepX, test.keepY = test.keepY, - already.tested.X = already.tested.X, already.tested.Y = already.tested.Y, - BPPARAM = BPPARAM, seed = seed) + result = tune.splslevel( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + # multilevel params + multilevel = multilevel, + # sparsity params + test.keepX = test.keepX, test.keepY = test.keepY, + already.tested.X = already.tested.X, already.tested.Y = already.tested.Y, + # running params + BPPARAM = BPPARAM, seed = seed) } - } + + ## ----------- plsda ----------- ## + + } else if (method == "plsda") { + + message("Calling 'tune.plsda'") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.plsda( + # model building params + X = X, Y = Y, ncomp = ncomp, multilevel = multilevel, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, + # PA params + dist = dist, auc = auc, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + + ## ----------- splsda ----------- ## + + } else if (method == "splsda") { + + message("Calling 'tune.splsda'") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.splsda( + # model building params + X = X, Y = Y, ncomp = ncomp, multilevel = multilevel, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, + # PA params + dist = dist, measure = measure, auc = auc, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + + + ## ----------- block.plsda ----------- ## + + } else if (method == "block.plsda") { + message("Calling 'tune.block.plsda'") + + result = tune.block.plsda( + # model building params + X = X, Y = Y, indY = indY, ncomp = ncomp, design = design, + tol = tol, scale = scale, max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, signif.threshold = signif.threshold, + # PA params + dist = dist, weighted = weighted, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) 
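+    # illustrative dispatch (data names assumed): tune(method = "block.plsda",
+    # X = list(mrna = mrna, mirna = mirna), Y = subtype, ncomp = 2,
+    # design = design) forwards to the new tune.block.plsda() above, where
+    # design is a square matrix of size length(X) as in block.plsda()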
+ + + ## ----------- block.splsda ----------- ## + + } else if (method == "block.splsda") { + message("Calling 'tune.block.splsda'") + + result = tune.block.splsda( + # model building params + X = X, Y = Y, indY = indY, ncomp = ncomp, design = design, + tol = tol, scale = scale, max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, signif.threshold = signif.threshold, + # PA params + dist = dist, weighted = weighted, measure = measure, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + + ## ----------- mint.plsda ----------- ## + + } else if (method == "mint.plsda") { + message("Calling 'tune.mint.plsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.mint.plsda( + # model building params + X = X, Y = Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, + # PA params + dist = dist, auc = auc, + # running params - no need for seed or BPPARAM due to LOO CV + progressBar = progressBar,light.output = light.output) + + + ## ----------- mint.splsda ----------- ## + + } else if (method == "mint.splsda") { + message("Calling 'tune.mint.splsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.mint.splsda( + # model building params + X = X, Y = Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # PA params + dist = dist, measure = measure, auc = auc, + # running params - no need for seed or BPPARAM due to LOO CV + progressBar = progressBar,light.output = light.output) + + } + + ## ----------- end ----------- ## + result$call = match.call() return(result) } diff --git a/R/tune.block.plsda.R b/R/tune.block.plsda.R new file mode 100644 index 00000000..80998fd2 --- /dev/null +++ b/R/tune.block.plsda.R @@ -0,0 +1,234 @@ +# ======================================================================================================== +# tune.block.plsda: Tuning hyperparameters on a block plsda method +# ======================================================================================================== +#' Tuning function for block.plsda method (N-integration with Discriminant Analysis) +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores based on a +#' user-input grid to determine the optimal parameters for +#' method \code{block.plsda}. +#' +#' This tuning function should be used to tune the number of components in the \code{block.plsda} function (N-integration with Discriminant Analysis). +#' +#' M-fold or LOO cross-validation is performed with stratified subsampling +#' where all classes are represented in each fold. +#' +#' If \code{validation = "Mfold"}, M-fold cross-validation is performed. The +#' number of folds to generate is to be specified in the argument \code{folds}. +#' +#' If \code{validation = "loo"}, leave-one-out cross-validation is performed. +#' By default \code{folds} is set to the number of unique individuals. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). Details +#' about the PLS modes are in \code{?pls}. 
+#'
+#' BER is appropriate in case of an unbalanced number of samples per class as
+#' it calculates the average proportion of wrongly classified samples in each
+#' class, weighted by the number of samples in each class. BER is less biased
+#' towards majority classes during the performance assessment.
+#'
+#' @inheritParams block.plsda
+#' @inheritParams tune
+#' @param dist Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all".
+#' Default is "all".
+#' @param weighted tune using either the performance of the Majority vote or
+#' the Weighted vote.
+#' @param signif.threshold numeric between 0 and 1 indicating the significance
+#' threshold required for improvement in error rate of the components. Default
+#' to 0.01.
+#' @param ... Optional arguments:
+#' \itemize{
+#'  \item \bold{seed} Integer. Seed number for reproducible parallel code.
+#'  Default is \code{NULL}.
+#' }
+#' The repeated cross-validation, which is usually the most computationally
+#' intensive process, can be run in parallel. If there is excess CPU, the
+#' cross-validation is also parallelised on *nix-based OS which support
+#' \code{mclapply}.
+#' Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'.
+#' @return returns:
+#' \item{error.rate}{Prediction error rate for each block of \code{object$X}
+#' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for
+#' each block of \code{object$X}, each \code{dist} and each class}
+#' \item{predict}{Predicted values of each sample for each class, each block
+#' and each component} \item{class}{Predicted class of each sample for each
+#' block, each \code{dist}, each component and each nrepeat} \item{features}{a
+#' list of features selected across the folds (\code{$stable.X} and
+#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the
+#' input object.} \item{AveragedPredict.class}{if more than one block, returns
+#' the average predicted class over the blocks (averaged of the \code{Predict}
+#' output and prediction using the \code{max.dist} distance)}
+#' \item{AveragedPredict.error.rate}{if more than one block, returns the
+#' average predicted error rate over the blocks (using the
+#' \code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more
+#' than one block, returns the weighted predicted class over the blocks
+#' (weighted average of the \code{Predict} output and prediction using the
+#' \code{max.dist} distance). See details for more info on weights.}
+#' \item{WeightedPredict.error.rate}{if more than one block, returns the
+#' weighted average predicted error rate over the blocks (using the
+#' \code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one
+#' block, returns the majority class over the blocks. NA for a sample means that
+#' there is no consensus on the predicted class for this particular sample over
+#' the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns
+#' the error rate of the \code{MajorityVote} output}
+#' \item{WeightedVote}{if more than one block, returns the weighted majority
+#' class over the blocks.
NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{WeightedVote.error.rate}{if more than one block, returns the error +#' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +#' of each block used for the weighted predictions, for each nrepeat and each +#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number +#' of components for the model for each prediction distance using one-sided +#' t-tests that test for a significant difference in the mean error rate (gain +#' in prediction) when components are added to the model. See more details in +#' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +#' returned for each prediction framework.} +#' +#' @author Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi +#' @seealso \code{\link{block.splsda}} and http://www.mixOmics.org for more +#' details. +#' @references Method: +#' +#' Singh A., Gautier B., Shannon C., Vacher M., Rohart F., Tebbutt S. and Lê +#' Cao K.A. (2016). DIABLO: multi omics integration for biomarker discovery. +#' +#' mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.block.plsda-examples.R +tune.block.plsda <- + function ( + # basic params + X, + Y, + indY, + ncomp = 2, + # model building params + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + design, + scale = TRUE, + # cross validation params + validation = "Mfold", + folds = 10, + nrepeat = 1, + signif.threshold=0.01, + # measure of performance params + dist = "all", + weighted = TRUE, # optimise the weighted or not-weighted prediction + # processing params + progressBar = FALSE, + light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + BPPARAM = SerialParam(), + seed = NULL, + ...) + { + if (hasArg('cpus')) #defunct + { + stop("'cpus' has been replaced by BPPARAM. 
See documentation.") + } + BPPARAM$RNGseed <- seed + set.seed(seed) + + # hardcode init and scheme + scheme <- 'horst' + init <- 'svd.single' + + ## ----------- checks ----------- + + # check input 'Y' and transformation in a dummy matrix + if (!missing(Y)) + { + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level") + + } else if (!missing(indY)) { + Y = X[[indY]] + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'X[[indY]]' should be a factor with more than one level") + + X = X[-indY] #remove Y from X to pass the arguments simpler to block.splsda + + } else if (missing(indY)) { + stop("Either 'Y' or 'indY' is needed") + + } + ## check using internal #TODO we need to unify the checks + Y.check <- unmap(Y) + Y.check <- matrix(Y.check, nrow = nrow(Y.check), dimnames = list(rownames(X[[1]]), NULL)) + Check.entry.wrapper.mint.block(X = X, Y = Y.check, indY = indY, + ncomp = ncomp, DA=TRUE, + design = design, init = init, scheme = scheme, scale = scale, + near.zero.var = near.zero.var, mode = 'regression', tol = tol, + max.iter = max.iter) + + ## ensure all X blocks are matrices, keeping dimnames + X <- lapply(X, function(z){ + zm <- z + if (!is.matrix(zm)) { + zm <- as.matrix(zm) + dimnames(zm) <- dimnames(z) + } + return(zm) + }) + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", + call. = FALSE) + + #-- ncomp + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + + #-- validation + choices = c("Mfold", "loo") + validation = choices[pmatch(validation, choices)] + if (is.na(validation)) + stop("'validation' must be either 'Mfold' or 'loo'") + + if (validation == 'loo') + { + if (nrepeat != 1) + message("Leave-One-Out validation does not need to be repeated: 'nrepeat' is set to '1'.") + nrepeat = 1 + } + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #-- validation + if (any(is.na(validation)) || length(validation) > 1) + stop("'validation' should be one of 'Mfold' or 'loo'.", call. 
= FALSE) + + ## ----------- Cross-validation ----------- + + block.plsda_res <- block.plsda(X, Y, ncomp = ncomp, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) + perf_res <- perf(block.plsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + +} \ No newline at end of file diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index 41661650..1b522c58 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -1,15 +1,15 @@ # ======================================================================================================== -# tune.splsda: chose the optimal number of parameters per component on a splsda method +# tune.block.splsda: Tuning hyperparameters on a block splsda method # ======================================================================================================== #' Tuning function for block.splsda method (N-integration with sparse #' Discriminant Analysis) #' #' Computes M-fold or Leave-One-Out Cross-Validation scores based on a -#' user-input grid to determine the optimal parsity parameters values for +#' user-input grid to determine the optimal parameters for #' method \code{block.splsda}. #' -#' This tuning function should be used to tune the keepX parameters in the -#' \code{block.splsda} function (N-integration with sparse Discriminant +#' This tuning function should be used to tune the number of components and the +#' keepX parameters in the \code{block.splsda} function (N-integration with sparse Discriminant #' Analysis). #' #' M-fold or LOO cross-validation is performed with stratified subsampling @@ -39,26 +39,21 @@ #' @param test.keepX A named list with the same length and names as X #' (without the outcome Y, if it is provided in X and designated using #' \code{indY}). Each entry of this list is a numeric vector for the different -#' keepX values to test for that specific block. +#' keepX values to test for that specific block. If set to NULL, ncomp is tuned. #' @param already.tested.X Optional, if \code{ncomp > 1} A named list of #' numeric vectors each of length \code{n_tested} indicating the number of #' variables to select from the \eqn{X} data set on the first \code{n_tested} #' components. +#' @param measure only used when \code{test.keepX} is not NULL. Measure used when plotting, +#' should be 'BER' or 'overall' +#' @param dist distance metric to estimate the classification error rate, should be one of +#' "centroids.dist", "mahalanobis.dist" or "max.dist" (see Details). If \code{test.keepX = NULL}, +#' can also input "all" or more than one distance metric #' @param weighted tune using either the performance of the Majority vote or #' the Weighted vote. #' @param signif.threshold numeric between 0 and 1 indicating the significance #' threshold required for improvement in error rate of the components. Default #' to 0.01. -#' @param ... Optional arguments: -#' \itemize{ -#' \item \bold{seed} Integer. Seed number for reproducible parallel code. -#' Default is \code{NULL}. -#' } -#' run in parallel when repeating the cross-validation, which is usually the -#' most computationally intensive process. If there is excess CPU, the -#' cross-vaidation is also parallelised on *nix-based OS which support -#' \code{mclapply}. -#' Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'. 
#' @return A list that contains: \item{error.rate}{returns the prediction error #' for each \code{test.keepX} on each component, averaged across all repeats #' and subsampling folds. Standard deviation is also output. All error rates @@ -76,6 +71,45 @@ #' #' \item{cor.value}{compute the correlation between latent variables for #' two-factor sPLS-DA analysis.} +#' +#' If \code{test.keepX = NULL}, returns: +#' \item{error.rate}{Prediction error rate for each block of \code{object$X} +#' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for +#' each block of \code{object$X}, each \code{dist} and each class} +#' \item{predict}{Predicted values of each sample for each class, each block +#' and each component} \item{class}{Predicted class of each sample for each +#' block, each \code{dist}, each component and each nrepeat} \item{features}{a +#' list of features selected across the folds (\code{$stable.X} and +#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the +#' input object.} \item{AveragedPredict.class}{if more than one block, returns +#' the average predicted class over the blocks (averaged of the \code{Predict} +#' output and prediction using the \code{max.dist} distance)} +#' \item{AveragedPredict.error.rate}{if more than one block, returns the +#' average predicted error rate over the blocks (using the +#' \code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more +#' than one block, returns the weighted predicted class over the blocks +#' (weighted average of the \code{Predict} output and prediction using the +#' \code{max.dist} distance). See details for more info on weights.} +#' \item{WeightedPredict.error.rate}{if more than one block, returns the +#' weighted average predicted error rate over the blocks (using the +#' \code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one +#' block, returns the majority class over the blocks. NA for a sample means that +#' there is no consensus on the predicted class for this particular sample over +#' the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +#' the error rate of the \code{MajorityVote} output} +#' \item{WeightedVote}{if more than one block, returns the weighted majority +#' class over the blocks. NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{WeightedVote.error.rate}{if more than one block, returns the error +#' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +#' of each block used for the weighted predictions, for each nrepeat and each +#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number +#' of components for the model for each prediction distance using one-sided +#' t-tests that test for a significant difference in the mean error rate (gain +#' in prediction) when components are added to the model. See more details in +#' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +#' returned for each prediction framework.} +#' #' @author Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi #' @seealso \code{\link{block.splsda}} and http://www.mixOmics.org for more #' details. 
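# ---------------------------------------------------------------------------
# Usage sketch for the new test.keepX = NULL behaviour documented above
# (illustration only, not part of the patch; assumes the breast.TCGA data
# shipped with mixOmics):
#
#   library(mixOmics)
#   data(breast.TCGA)
#   X <- list(mrna = breast.TCGA$data.train$mrna,
#             mirna = breast.TCGA$data.train$mirna)
#   Y <- breast.TCGA$data.train$subtype
#   design <- matrix(0.1, nrow = length(X), ncol = length(X),
#                    dimnames = list(names(X), names(X)))
#   diag(design) <- 0
#   # tunes ncomp only, by delegating to perf() internally
#   tune.ncomp <- tune.block.splsda(X, Y, ncomp = 3, design = design,
#                                   test.keepX = NULL, validation = "Mfold",
#                                   folds = 5, nrepeat = 2, dist = "all")
# ---------------------------------------------------------------------------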
@@ -93,32 +127,35 @@ #' @export #' @example ./examples/tune.block.splsda-examples.R tune.block.splsda <- - function (X, + function ( + # basic params + X, Y, indY, ncomp = 2, - test.keepX, - already.tested.X, - validation = "Mfold", - folds = 10, - dist = "max.dist", - measure = "BER", - # one of c("overall","BER") - weighted = TRUE, - # optimise the weighted or not-weighted prediction - progressBar = FALSE, + # model building params tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - nrepeat = 1, design, scale = TRUE, - light.output = TRUE, - # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + # sparse method params + test.keepX, + already.tested.X, + # cross validation params + validation = "Mfold", + folds = 10, + nrepeat = 1, signif.threshold=0.01, + # measure of performance params + dist = "max.dist", + measure = "BER", # one of c("overall","BER") + weighted = TRUE, # optimise the weighted or not-weighted prediction + # processing params + progressBar = FALSE, + light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat BPPARAM = SerialParam(), - seed = NULL, - ...) + seed = NULL) { if (hasArg('cpus')) #defunct { @@ -182,13 +219,6 @@ tune.block.splsda <- } return(zm) }) - - #-- dist - dist = match.arg( - dist, - choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), - several.ok = TRUE - ) #-- progressBar if (!is.logical(progressBar)) @@ -199,7 +229,6 @@ tune.block.splsda <- if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) stop("invalid number of variates, 'ncomp'.") - #-- validation choices = c("Mfold", "loo") validation = choices[pmatch(validation, choices)] @@ -222,7 +251,6 @@ tune.block.splsda <- signif.threshold <- .check_alpha(signif.threshold) #-- already.tested.X - if (missing(already.tested.X)) { already.tested.X = NULL @@ -248,262 +276,283 @@ tune.block.splsda <- ) } + #-- validation if (any(is.na(validation)) || length(validation) > 1) stop("'validation' should be one of 'Mfold' or 'loo'.", call. 
= FALSE) #-- test.keepX - if (missing(test.keepX)) - { - test.keepX = lapply(X, function(x) { - max.test.keepX <- min(30, ncol(x)) - if (max.test.keepX > 15) - return(seq(5, max.test.keepX, 5)) - else - return(seq(1, max.test.keepX, 2)) - }) - + if (is.null(test.keepX)){ + print("test.keepX is set to NULL, tuning only for number of components...") + block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) + perf_res <- perf(block.splsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + } else { - if (length(test.keepX) != length(X)) - stop( - paste( - "test.keepX should be a list of length ", - length(X), - ", corresponding to the blocks: ", - paste(names(X), collapse = ", "), - sep = "" + + #-- dist (for tune can only have one dist, for perf can have multiple) + dist = match.arg( + dist, + choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), + several.ok = TRUE + ) + + if (missing(test.keepX)) + { + test.keepX = lapply(X, function(x) { + max.test.keepX <- min(30, ncol(x)) + if (max.test.keepX > 15) + return(seq(5, max.test.keepX, 5)) + else + return(seq(1, max.test.keepX, 2)) + }) + + } else { + if (length(test.keepX) != length(X)) + stop( + paste( + "test.keepX should be a list of length ", + length(X), + ", corresponding to the blocks: ", + paste(names(X), collapse = ", "), + sep = "" + ) ) - ) + + #aa = sapply(test.keepX, length) + #if (any(is.null(aa) | aa == 1 | !is.numeric(aa))) + #stop("Each entry of 'test.keepX' must be a numeric vector with more than two values", call. = FALSE) + + } - #aa = sapply(test.keepX, length) - #if (any(is.null(aa) | aa == 1 | !is.numeric(aa))) - #stop("Each entry of 'test.keepX' must be a numeric vector with more than two values", call. = FALSE) + l = sapply(test.keepX, length) + n = names(test.keepX) + temp = data.frame(l, n) - } - - l = sapply(test.keepX, length) - n = names(test.keepX) - temp = data.frame(l, n) - - - message( - paste( - "\nYou have provided a sequence of keepX of length: ", - paste(apply(temp, 1, function(x) - paste(x, collapse = " for block ")), collapse = " and "), - ".\nThis results in ", - prod(sapply(test.keepX, length)), - " models being fitted for each component and each nrepeat, this may take some time to run, be patient!", - sep = "" + + message( + paste( + "\nYou have provided a sequence of keepX of length: ", + paste(apply(temp, 1, function(x) + paste(x, collapse = " for block ")), collapse = " and "), + ".\nThis results in ", + prod(sapply(test.keepX, length)), + " models being fitted for each component and each nrepeat, this may take some time to run, be patient!", + sep = "" + ) ) - ) - - if (is (BPPARAM, 'SerialParam')) - { - message(paste0( - "\nYou can look into the 'BPPARAM' argument to speed up computation time." - )) - } else { - if (progressBar == TRUE) + if (is (BPPARAM, 'SerialParam')) + { message(paste0( - "\nAs code is running in parallel, the progressBar is not available." + "\nYou can look into the 'BPPARAM' argument to speed up computation time." )) - } - - ## ----------- END checks -----------# - - ## ----------- NA calculation ----------- - - misdata = c(sapply(X, anyNA), Y = FALSE) # Detection of missing data. 
we assume no missing values in the factor Y - - is.na.A = vector("list", length = length(X)) - for (q in seq_along(X)) - { - if (misdata[q]) + + } else { + if (progressBar == TRUE) + message(paste0( + "\nAs code is running in parallel, the progressBar is not available." + )) + } + + ## ----------- END checks -----------# + + ## ----------- NA calculation ----------- + + misdata = c(sapply(X, anyNA), Y = FALSE) # Detection of missing data. we assume no missing values in the factor Y + + is.na.A = vector("list", length = length(X)) + for (q in seq_along(X)) { - is.na.A[[q]] = is.na(X[[q]]) - #ind.NA[[q]] = which(apply(is.na.A[[q]], 1, sum) > 0) # calculated only once - #ind.NA.col[[q]] = which(apply(is.na.A[[q]], 2, sum) >0) # indice of the col that have missing values. used in the deflation + if (misdata[q]) + { + is.na.A[[q]] = is.na(X[[q]]) + #ind.NA[[q]] = which(apply(is.na.A[[q]], 1, sum) > 0) # calculated only once + #ind.NA.col[[q]] = which(apply(is.na.A[[q]], 2, sum) >0) # indice of the col that have missing values. used in the deflation + } } - } - - ## ----------- END NA calculation ----------- # - - - # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) - if ((!is.null(already.tested.X)) & length(already.tested.X) > 0) - { - comp.real = (length(already.tested.X[[1]]) + 1):ncomp - #check and match already.tested.X to X - if (length(already.tested.X[[1]]) > 0) + + ## ----------- END NA calculation ----------- # + + + # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) + if ((!is.null(already.tested.X)) & length(already.tested.X) > 0) { - if (length(unique(names(already.tested.X))) != length(already.tested.X) | - sum(is.na(match(names( - already.tested.X - ), names(X)))) > 0) - stop( - "Each entry of 'already.tested.X' must have a unique name corresponding to a block of 'X'" - ) + comp.real = (length(already.tested.X[[1]]) + 1):ncomp + #check and match already.tested.X to X + if (length(already.tested.X[[1]]) > 0) + { + if (length(unique(names(already.tested.X))) != length(already.tested.X) | + sum(is.na(match(names( + already.tested.X + ), names(X)))) > 0) + stop( + "Each entry of 'already.tested.X' must have a unique name corresponding to a block of 'X'" + ) + + } + } else { + comp.real = seq_len(ncomp) } - } else { - comp.real = seq_len(ncomp) - } - - # near zero var on the whole data sets. It will be performed inside each fold as well - if (near.zero.var == TRUE) - { - nzv.A = lapply(X, nearZeroVar) - for (q in seq_along(X)) + # near zero var on the whole data sets. It will be performed inside each fold as well + if (near.zero.var == TRUE) { - if (length(nzv.A[[q]]$Position) > 0) + nzv.A = lapply(X, nearZeroVar) + for (q in seq_along(X)) { - names.remove.X = colnames(X[[q]])[nzv.A[[q]]$Position] - X[[q]] = X[[q]][, -nzv.A[[q]]$Position, drop = FALSE] - warning( - "Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors." - ) - if (ncol(X[[q]]) == 0) - stop(paste0("No more variables in", X[[q]])) + if (length(nzv.A[[q]]$Position) > 0) + { + names.remove.X = colnames(X[[q]])[nzv.A[[q]]$Position] + X[[q]] = X[[q]][, -nzv.A[[q]]$Position, drop = FALSE] + warning( + "Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors." 
+ ) + if (ncol(X[[q]]) == 0) + stop(paste0("No more variables in X[[", q, "]]")) + + # check that test.keepX[[q]] does not exceed the remaining ncol(X[[q]]) + if (any(test.keepX[[q]] > ncol(X[[q]]))) + test.keepX[[q]][which(test.keepX[[q]] > ncol(X[[q]]))] = ncol(X[[q]]) + } - #need to check that the keepA[[q]] is now not higher than ncol(A[[q]]) - if (any(test.keepX[[q]] > ncol(X[[q]]))) - test.keepX[[q]][which(test.keepX[[q]] > ncol(X[[q]]))] = ncol(X[[q]]) - } - } - N.test.keepX = nrow(expand.grid(test.keepX)) - - mat.error.rate = list() - - mat.sd.error = matrix(0, - nrow = N.test.keepX, - ncol = ncomp - length(already.tested.X[[1]])) - - mat.mean.error = matrix(nrow = N.test.keepX, + } + N.test.keepX = nrow(expand.grid(test.keepX)) + + mat.error.rate = list() + + mat.sd.error = matrix(0, + nrow = N.test.keepX, ncol = ncomp - length(already.tested.X[[1]])) - - - mat.error.rate = list() - error.per.class.keepX.opt = list() - error.per.class.keepX.opt.mean = matrix( - 0, - nrow = nlevels(Y), - ncol = length(comp.real), - dimnames = list(c(levels(Y)), c(paste0('comp', comp.real))) - ) - - error.opt.per.comp = matrix( - nrow = nrepeat, - ncol = length(comp.real), - dimnames = list(paste("nrep", seq_len(nrepeat), sep = "."), paste0("comp", comp.real)) - ) - - if (light.output == FALSE) - class.all = list() - - ## ----------- tune components ----------- - - # successively tune the components until ncomp: comp1, then comp2, ... - for (comp in seq_along(comp.real)) - { - tune_comp <- comp.real[comp] - if (progressBar == TRUE) - cat(sprintf("\ntuning component %s\n", tune_comp)) - - result = MCVfold.block.splsda( - X, - Y, - validation = validation, - folds = folds, - nrepeat = nrepeat, - ncomp = tune_comp, - choice.keepX = already.tested.X, - scheme = scheme, - design = design, - init = init, - tol = tol, - test.keepX = test.keepX, - measure = measure, - dist = dist, - scale = scale, - weighted = weighted, - near.zero.var = near.zero.var, - progressBar = progressBar, - max.iter = max.iter, - misdata = misdata, - is.na.A = is.na.A, - BPPARAM = BPPARAM + + mat.mean.error = matrix(nrow = N.test.keepX, + ncol = ncomp - length(already.tested.X[[1]])) + + + mat.error.rate = list() + error.per.class.keepX.opt = list() + error.per.class.keepX.opt.mean = matrix( + 0, + nrow = nlevels(Y), + ncol = length(comp.real), + dimnames = list(c(levels(Y)), c(paste0('comp', comp.real))) ) + error.opt.per.comp = matrix( + nrow = nrepeat, + ncol = length(comp.real), + dimnames = list(paste("nrep", seq_len(nrepeat), sep = "."), paste0("comp", comp.real)) + ) - ## returns error.rate for all test.keepX + if (light.output == FALSE) + class.all = list() - # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.block.splsda' can work with multiple distances - mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] - mat.mean.error[, comp] = result[[measure]]$error.rate.mean[[1]] - if (!is.null(result[[measure]]$error.rate.sd[[1]])) - mat.sd.error[, comp] = result[[measure]]$error.rate.sd[[1]] + ## ----------- tune components ----------- - # confusion matrix for keepX.opt - error.per.class.keepX.opt[[comp]] = result[[measure]]$confusion[[1]] - error.per.class.keepX.opt.mean[, comp] = apply(result[[measure]]$confusion[[1]], 1, mean) + # successively tune the components until ncomp: comp1, then comp2, ...
+ for (comp in seq_along(comp.real)) + { + tune_comp <- comp.real[comp] + if (progressBar == TRUE) + cat(sprintf("\ntuning component %s\n", tune_comp)) + + result = MCVfold.block.splsda( + X, + Y, + validation = validation, + folds = folds, + nrepeat = nrepeat, + ncomp = tune_comp, + choice.keepX = already.tested.X, + scheme = scheme, + design = design, + init = init, + tol = tol, + test.keepX = test.keepX, + measure = measure, + dist = dist, + scale = scale, + weighted = weighted, + near.zero.var = near.zero.var, + progressBar = progressBar, + max.iter = max.iter, + misdata = misdata, + is.na.A = is.na.A, + BPPARAM = BPPARAM + ) + + + ## returns error.rate for all test.keepX + + # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.block.splsda' can work with multiple distances + mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] + mat.mean.error[, comp] = result[[measure]]$error.rate.mean[[1]] + if (!is.null(result[[measure]]$error.rate.sd[[1]])) + mat.sd.error[, comp] = result[[measure]]$error.rate.sd[[1]] + + # confusion matrix for keepX.opt + error.per.class.keepX.opt[[comp]] = result[[measure]]$confusion[[1]] + error.per.class.keepX.opt.mean[, comp] = apply(result[[measure]]$confusion[[1]], 1, mean) + + # error rate for best keepX + error.opt.per.comp[, comp] = mat.error.rate[[comp]][result[[measure]]$ind.keepX.opt[[1]], ] + + # best keepX + already.tested.X = result[[measure]]$choice.keepX + + if (light.output == FALSE) + { + # prediction of each sample for each fold and each repeat, on each comp + class.all[[comp]] = result$class.comp[[1]] + } + } - # error rate for best keepX - error.opt.per.comp[, comp] = mat.error.rate[[comp]][result[[measure]]$ind.keepX.opt[[1]], ] + ## ----------- END tune components ----------- # - # best keepX - already.tested.X = result[[measure]]$choice.keepX + ## ----------- output ----------- - if (light.output == FALSE) + rownames(mat.mean.error) = rownames(result[[measure]]$mat.error.rate[[1]]) + colnames(mat.mean.error) = paste0("comp", comp.real) + names(mat.error.rate) = c(paste0("comp", comp.real)) + names(error.per.class.keepX.opt) = c(paste0("comp", comp.real)) + if (nrepeat > 1) { - #prediction of each samples for each fold and each repeat, on each comp - class.all[[comp]] = result$class.comp[[1]] + rownames(mat.sd.error) = rownames(result[[measure]]$mat.error.rate[[1]]) + colnames(mat.sd.error) = paste0("comp", comp.real) } + + + # determine the optimal number of components via t-tests on error.rate.all (requires nrepeat > 2) + if (nrepeat > 2 & length(comp.real) > 1) + { + error.keepX = error.opt.per.comp + opt = t.test.process(error.opt.per.comp, alpha = signif.threshold) + ncomp_opt = comp.real[opt] + } else { + ncomp_opt = error.keepX = NULL + } + + + result = list( + error.rate = mat.mean.error, + error.rate.sd = mat.sd.error, + error.rate.all = mat.error.rate, + choice.keepX = already.tested.X, + choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), + error.rate.class = error.per.class.keepX.opt + ) + + result$measure = measure.input + result$call = match.call() + + class(result) = "tune.block.splsda" + + return(result) + } - - ## ----------- END tune components ----------- # - - ## ----------- output ----------- - - rownames(mat.mean.error) = rownames(result[[measure]]$mat.error.rate[[1]]) - colnames(mat.mean.error) = paste0("comp", comp.real) - names(mat.error.rate) = c(paste0("comp", comp.real)) - names(error.per.class.keepX.opt) = c(paste0("comp", comp.real)) - if (nrepeat > 1) - { - rownames(mat.sd.error) = rownames(result[[measure]]$mat.error.rate[[1]]) - colnames(mat.sd.error) = paste0("comp", comp.real) - } - - - # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) - if (nrepeat > 2 & length(comp.real) > 1) - { - error.keepX = error.opt.per.comp - opt = t.test.process(error.opt.per.comp, alpha = signif.threshold) - ncomp_opt = comp.real[opt] - } else { - ncomp_opt = error.keepX = NULL - } - - - result = list( - error.rate = mat.mean.error, - error.rate.sd = mat.sd.error, - error.rate.all = mat.error.rate, - choice.keepX = already.tested.X, - choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), - error.rate.class = error.per.class.keepX.opt - ) - - result$measure = measure.input - result$call = match.call() - - class(result) = "tune.block.splsda" - - return(result) - - } + } \ No newline at end of file diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R new file mode 100644 index 00000000..28dca587 --- /dev/null +++ b/R/tune.mint.plsda.R @@ -0,0 +1,190 @@ +# ======================================================================================================== +# tune.mint.plsda: choose the optimal number of components for a mint.plsda method +# ======================================================================================================== +#' Estimate the parameters of mint.plsda method +#' +#' Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores to +#' determine the optimal number of components in +#' \code{mint.plsda}. +#' +#' This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), +#' where each of \code{study} is left out once. +#' +#' The function outputs the optimal number of components that achieve the best +#' performance based on the overall error rate or BER. The assessment is +#' data-driven and similar to the process detailed in (Rohart et al., 2016), +#' where one-sided t-tests assess whether there is a gain in performance when +#' adding a component to the model. Our experience has shown that in most cases, +#' the optimal number of components is the number of categories in \code{Y} - +#' 1, but it is worth tuning a few extra components to check (see our website +#' and case studies for more details). +#' +#' BER is appropriate in case of an unbalanced number of samples per class as +#' it calculates the average proportion of wrongly classified samples in each +#' class, weighted by the number of samples in each class. BER is less biased +#' towards majority classes during the performance assessment. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). +#' +#' @param X numeric matrix of predictors. \code{NA}s are allowed. +#' @param Y a factor or a class vector indicating the discrete outcome of each +#' sample +#' @param ncomp Number of components to include in the model (see Details). +#' Defaults to 1 +#' @param study grouping factor indicating which samples are from the same +#' study +#' @param dist only applies to an object inheriting from \code{"plsda"} or +#' \code{"splsda"} to evaluate the classification performance of the model. +#' Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"}. Default is \code{"all"}. See +#' \code{\link{predict}}.
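To make the refactored tune.block.splsda behaviour above concrete, here is a minimal, hedged sketch of the new component-only mode. It assumes the breast.TCGA example data shipped with mixOmics; the fold and nrepeat settings are illustrative only, and with test.keepX = NULL the function is expected to simply fit block.splsda and return the perf() output rather than a tune.block.splsda object.

library(mixOmics)
data(breast.TCGA)
# two training blocks and the subtype outcome (assumed example data)
X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna)
Y <- breast.TCGA$data.train$subtype
# standard weakly-connected design matrix
design <- matrix(0.1, nrow = length(X), ncol = length(X)); diag(design) <- 0
# test.keepX = NULL dispatches to perf() and tunes ncomp only
tune0 <- tune.block.splsda(X, Y, ncomp = 3, design = design, test.keepX = NULL,
                           validation = "Mfold", folds = 5, nrepeat = 3)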
+#' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) +#' performance of the model. +#' @param progressBar set to \code{TRUE} to output the progress bar +#' of the computation (default: \code{FALSE}). +#' @param scale Logical. If scale = TRUE, each block is standardized to zero +#' means and unit variances (default: TRUE) +#' @param tol Convergence stopping value. +#' @param max.iter integer, the maximum number of iterations. +#' @param near.zero.var Logical, see the internal \code{\link{nearZeroVar}} +#' function (should be set to TRUE in particular for data with many zero +#' values). Default value is FALSE +#' @param light.output if set to FALSE, the prediction/classification of each +#' sample for each comp is returned. +#' @param signif.threshold numeric between 0 and 1 indicating the significance +#' threshold required for improvement in error rate of the components. Defaults +#' to 0.01. +#' @return The returned value is a list with components: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist} and each of the +#' \code{ncomp} components. Directly obtained from the \code{predict} output.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +#' models}. +#' +#' @author Florian Rohart, Al J Abadi +#' @seealso \code{\link{mint.plsda}} and http://www.mixOmics.org for more +#' details. +#' @references Rohart F, Eslami A, Matigian N, Bougeard S, Lê Cao K-A (2017). +#' MINT: A multivariate integrative approach to identify a reproducible +#' biomarker signature across multiple experiments and platforms. BMC +#' Bioinformatics 18:128. +#' +#' mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords multivariate dplot +#' @export +#' @example ./examples/tune.mint.plsda-examples.R + +tune.mint.plsda <- + function (X, + Y, + ncomp = 1, + study, + # model building params + scale = TRUE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + # CV params + signif.threshold = 0.01, + # PA params + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + auc = FALSE, + # running params + progressBar = FALSE, + light.output = TRUE # if FALSE, output the prediction and classification of each sample during each fold, on each comp, for each repeat + ) + { + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + + ## R CMD check stuff + BPPARAM <- seed <- NULL + + #------------------# + #-- check entries --# + if(missing(X)) + stop("'X' is missing", call. = FALSE) + + X = as.matrix(X) + + if (length(dim(X)) != 2 || !is.numeric(X)) + stop("'X' must be a numeric matrix.", call. = FALSE) + + + # Testing the input Y + if(missing(Y)) + stop("'Y' is missing", call. = FALSE) + if (is.null(Y)) + stop("'Y' has to be something other than NULL.", call. = FALSE) + + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.", call.
= FALSE) + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level", call. = FALSE) + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", call. = FALSE) + + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + # -- check using the check of mint.splsda + Y.mat = unmap(Y) + colnames(Y.mat) = levels(Y) + + check = Check.entry.pls(X, Y = Y.mat, ncomp = ncomp, mode="regression", scale=scale, + near.zero.var=near.zero.var, max.iter=max.iter ,tol=tol ,logratio="none" ,DA=TRUE, multilevel=NULL) + X = check$X + ncomp = check$ncomp + + # -- study + #set the default study factor + if (missing(study)) + stop("'study' is missing", call. = FALSE) + + if (length(study) != nrow(X)) + stop(paste0("'study' must be a factor of length ",nrow(X),".")) + + if (any(table(study) <= 1)) + stop("At least one study has only one sample, please consider removing it before calling the function again", call. = FALSE) + if (any(table(study) < 5)) + warning("At least one study has fewer than 5 samples, mean centering might not behave as expected") + + if(sum(apply(table(Y,study)!=0,2,sum)==1) >0) + stop("At least one study only contains a single level of the multi-level outcome Y. The MINT algorithm cannot be computed.") + + if(sum(apply(table(Y,study)==0,2,sum)>0) >0) + warning("At least one study does not contain all the levels of the outcome Y. The MINT algorithm might not perform as expected.") + + #-- light.output + if (!is.logical(light.output)) + stop("'light.output' must be either TRUE or FALSE", call. = FALSE) + + #-- run perf to tune ncomp --------------------------------------# + #---------------------------------------------------------------------------# + mint.plsda_res <- mint.plsda(X, Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var) + perf_res <- perf(mint.plsda_res, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + } \ No newline at end of file diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 26316c0c..5b97f0a1 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -4,17 +4,14 @@ #' Estimate the parameters of mint.splsda method #' #' Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a -#' user-input grid to determine optimal values for the sparsity parameters in +#' user-input grid to determine optimal values for the parameters in #' \code{mint.splsda}. #' #' This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), -#' where each of \code{study} is left out once. It returns a list of variables -#' of \code{X} that were selected on each of the \code{ncomp} components. Then, -#' a \code{\link{mint.splsda}} can be performed with \code{keepX} set as the -#' output \code{choice.keepX}. +#' where each of \code{study} is left out once. #' -#' All component \eqn{1:\code{ncomp}} are tuned, except the first ones for -#' which a \code{already.tested.X} is provided. See examples below. +#' When \code{test.keepX} is not NULL, all components \eqn{1:\code{ncomp}} are tuned to identify the number of variables to keep, +#' except the first ones for which a \code{already.tested.X} is provided. See examples below.
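A hedged usage sketch for the new tune.mint.plsda wrapper defined above. It uses the stemcells data employed by the other MINT examples in this changeset; the number of components is illustrative, and the accessor in the last line assumes the perf() output structure documented in the @return section.

library(mixOmics)
data(stemcells)
# LOGOCV across studies; returns the perf() output (study.specific.error, global.error, ...)
tune_res <- tune.mint.plsda(X = stemcells$gene, Y = stemcells$celltype,
                            ncomp = 3, study = stemcells$study)
tune_res$global.error  # BER, overall error rate and error rate per class (assumed field)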
#' #' The function outputs the optimal number of components that achieve the best #' performance based on the overall error rate or BER. The assessment is @@ -41,7 +38,7 @@ #' @param study grouping factor indicating which samples are from the same #' study #' @param test.keepX numeric vector for the different number of variables to -#' test from the \eqn{X} data set +#' test from the \eqn{X} data set. If set to NULL, only the number of components will be tuned. #' @param already.tested.X if \code{ncomp > 1} Numeric vector indicating the #' number of variables to select from the \eqn{X} data set on the first #' components @@ -51,7 +48,8 @@ #' \code{"mahalanobis.dist"}. Default is \code{"all"}. See #' \code{\link{predict}}. #' @param measure Two misclassification measure are available: overall -#' misclassification error \code{overall} or the Balanced Error Rate \code{BER} +#' misclassification error \code{overall} or the Balanced Error Rate \code{BER}. +#' Only used when \code{test.keepX} is not NULL. #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) #' performance of the model. #' @param progressBar by default set to \code{TRUE} to output the progress bar @@ -81,6 +79,18 @@ #' \item{predict}{Prediction values for each sample, each \code{test.keepX} and #' each comp.} \item{class}{Predicted class for each sample, each #' \code{test.keepX} and each comp.} +#' +#' If \code{test.keepX = NULL}, returns: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist} and each of the +#' \code{ncomp} components. Directly obtained from the \code{predict} output.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +#' models}. +#' #' @author Florian Rohart, Al J Abadi #' @seealso \code{\link{mint.splsda}} and http://www.mixOmics.org for more #' details.
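The two modes documented above chain naturally: tune ncomp first, then tune keepX on the retained components. A sketch based on the stemcells data (grid and component values are illustrative; the test.keepX = NULL call is assumed to return the perf() output, the grid call a tune.mint.splsda object):

data(stemcells)
# step 1: test.keepX = NULL dispatches to perf() and tunes ncomp only
tune_ncomp <- tune.mint.splsda(X = stemcells$gene, Y = stemcells$celltype,
                               ncomp = 3, study = stemcells$study, test.keepX = NULL)
# step 2: tune keepX over a grid on the retained number of components
tune_keepX <- tune.mint.splsda(X = stemcells$gene, Y = stemcells$celltype,
                               ncomp = 2, study = stemcells$study,
                               test.keepX = seq(1, 10, 2))
tune_keepX$choice.keepX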
@@ -96,53 +106,38 @@ #' e1005752 #' @keywords multivariate dplot #' @export -#' @examples -#' data(stemcells) -#' data = stemcells$gene -#' type.id = stemcells$celltype -#' exp = stemcells$study -#' -#' res = mint.splsda(X=data,Y=type.id,ncomp=3,keepX=c(10,5,15),study=exp) -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -#' study=exp,test.keepX=seq(1,10,1)) -#' -#' out$choice.ncomp -#' out$choice.keepX -#' -#' \dontrun{ -#' -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -#' study=exp,test.keepX=seq(1,10,1)) -#' out$choice.keepX -#' -#' ## only tune component 2 and keeping 10 genes on comp1 -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2, study=exp, -#' already.tested.X = c(10), -#' test.keepX=seq(1,10,1)) -#' out$choice.keepX -#' -#' } +#' @example ./examples/tune.mint.splsda-examples.R + tune.mint.splsda <- function (X, Y, ncomp = 1, study, - test.keepX = c(5, 10, 15), + # sparsity params + test.keepX = NULL, already.tested.X, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, + # model building params scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold=0.01 + # CV params + signif.threshold = 0.01, + # PA params + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + # running params + progressBar = FALSE, + light.output = TRUE # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + ) { #-- checking general input parameters --------------------------------------# #---------------------------------------------------------------------------# + + ## R CMD check stuff + BPPARAM <- seed <- NULL #------------------# #-- check entries --# @@ -183,9 +178,6 @@ tune.mint.splsda <- stop("invalid number of variates, 'ncomp'.") - #-- measure - measure <- match.arg(measure) - #if ((!is.null(already.tested.X)) && (length(already.tested.X) != (ncomp - 1)) ) #stop("The number of already tested parameters should be NULL or ", ncomp - 1, " since you set ncomp = ", ncomp) if (missing(already.tested.X)) @@ -230,17 +222,33 @@ tune.mint.splsda <- if(sum(apply(table(Y,study)==0,2,sum)>0) >0) warning("At least one study does not contain all the levels of the outcome Y. The MINT algorithm might not perform as expected.") - - #-- dist - dist = match.arg(dist) - #-- light.output if (!is.logical(light.output)) stop("'light.output' must be either TRUE or FALSE", call. = FALSE) - #-- test.keepX + #-- test.keepX, if null just tune on ncomp + if (is.null(test.keepX)){ + print("test.keepX is set to NULL, tuning only for number of components...") + mint.splsda_res <- mint.splsda(X, Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var) + perf_res <- perf(mint.splsda_res, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } else { if (is.null(test.keepX) | length(test.keepX) == 1 | !is.numeric(test.keepX)) stop("'test.keepX' must be a numeric vector with more than two entries", call. 
= FALSE) + + #-- dist (for tune can only have one dist, for perf can have multiple) + dist = match.arg( + dist, + choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), + several.ok = TRUE + ) + + #-- measure + measure <- match.arg(measure) #-- end checking --# #------------------# @@ -362,4 +370,5 @@ tune.mint.splsda <- class(result) = c("tune.mint.splsda","tune.splsda") return(result) + } } diff --git a/R/tune.pca.R b/R/tune.pca.R index 449daa36..abc2f985 100644 --- a/R/tune.pca.R +++ b/R/tune.pca.R @@ -52,17 +52,14 @@ #' for more details. #' @keywords algebra #' @export -#' @examples -#' data(liver.toxicity) -#' tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE) -#' tune -#' plot(tune) +#' @example ./examples/tune.pca-examples.R + tune.pca <- function(X, ncomp = NULL, center = TRUE, # sets the mean of the data to zero, ensures that the first PC describes the direction of the maximum variance - scale = FALSE, # variance is unit across different units - max.iter = 500, + scale = TRUE, # standardises each variable to unit variance + max.iter = 100, tol = 1e-09, logratio = c('none','CLR','ILR'), V = NULL, diff --git a/R/tune.pls.R b/R/tune.pls.R new file mode 100644 index 00000000..b882bb98 --- /dev/null +++ b/R/tune.pls.R @@ -0,0 +1,193 @@ +############################################################################################################# +# Authors: +# Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Benoit Gautier, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Francois Bartolo, Institut National des Sciences Appliquees et Institut de Mathematiques, Universite de Toulouse et CNRS (UMR 5219), France +# +# created: 2013 +# last modified: 05-10-2017 +# +# Copyright (C) 2013 +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +############################################################################################################# + + +# ======================================================================================================== +# tune.pls: Tuning hyperparameters on a pls method +# ======================================================================================================== +#' +#' Tuning functions for PLS method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{pls}. +#' +#' +#' This tuning function should be used to tune the number of components to select for pls models.
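On the tune.pca defaults changed above (scale now TRUE, max.iter now 100): analyses that relied on the previous unscaled behaviour must request it explicitly. A minimal sketch with the liver.toxicity data from the former example:

data(liver.toxicity)
# scale = FALSE reproduces the previous default behaviour
tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = FALSE)
plot(tune)  # proportion of explained variance per component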
+#' +#' +#' @template section/folds +#' @template section/nrepeat +#' @template section/measure-pls +#' @template section/t-test-process +#' +#' @section more: +#' See also \code{?perf} for more details. +#' +#' @param X numeric matrix of predictors with the rows as individual observations. +#' @param Y numeric matrix of response(s) with the rows as individual observations matching \code{X}. +#' @template arg/ncomp +#' @template arg/validation +#' @template arg/folds +#' @template arg/nrepeat +#' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. +#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} +#' @template arg/mode +#' @param scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE). +#' @param tol Positive numeric used as convergence criteria/tolerance during the iterative process. Defaults to 1e-06. +#' @param max.iter Integer, the maximum number of iterations. Defaults to 100. +#' @param near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Defaults to 'none'. See ?logratio.transfo for details. +#' @param multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. +#' @template arg/progressBar +#' @template arg/BPPARAM +#' @param seed set a number here if you want the function to give reproducible outputs. +#' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. +#' @param ... Optional parameters passed to \code{\link{pls}} +#' @return +#' Returns a list with the following components for every repeat: +#' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +#' inherited from \code{"pls"}, and \code{"spls"}. Only available when in +#' regression (s)PLS.} +#' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +#' else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +#' Note that in the specific case of an sPLS model, it is better to have a look +#' at the Q2.total criterion, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}.
Available in both (s)PLS modes.} +#' \item{RSS}{Residual Sum of Squares across all selected features and the +#' components.} +#' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected +#' features and the components.} +#' \item{features}{a list of features selected across the +#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} +#' if using standard (non-sparse) PLS.} +#' \item{cor.tpred, cor.upred}{Correlation between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +#' predicted and actual components for X (t) and Y (u)} +#' +#' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. +#' @seealso \code{\link{pls}}, \code{\link{predict}}, and http://www.mixOmics.org for more details. +#' @references mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' +#' PLS and PLS criteria for PLS regression: Tenenhaus, M. (1998). La regression +#' PLS: theorie et pratique. Paris: Editions Technic. +#' +#' Chavent, Marie and Patouille, Brigitte (2003). Calcul des coefficients de +#' regression et du PRESS en regression PLS1. Modulad n, 30 1-11. (this is the +#' formula we use to calculate the Q2 in perf.pls and perf.spls) +#' +#' Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction +#' (MSEP) Estimates for Principal Component Regression (PCR) and Partial Least +#' Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429. +#' +#' Sparse PLS regression mode: +#' +#' Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse +#' PLS for variable selection when integrating Omics data. Statistical +#' Applications in Genetics and Molecular Biology 7, article 35. +#' +#' One-sided t-tests (suppl material): +#' +#' Rohart F, Mason EA, Matigian N, Mosbergen R, Korn O, Chen T, Butcher S, +#' Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A&, Wells CA& +#' (2016). A Molecular Classification of Human Mesenchymal Stromal Cells. PeerJ +#' 4:e1845. +#' +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.pls-examples.R +#' +tune.pls <- + function(X, + Y, + ncomp, + # params related to CV + validation = c('Mfold', 'loo'), + nrepeat = 1, + folds, + measure = NULL, + # params related to spls model building + mode = c('regression', 'canonical', 'classic'), + scale = TRUE, + logratio = "none", + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, + # params related to running tune + BPPARAM = SerialParam(), + seed = NULL, + progressBar = FALSE, + ...
+ ) { + + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + out = list() + mode <- match.arg(mode) + + BPPARAM$RNGseed <- seed + + # hardcode to streamline + limQ2 <- 0.0975 + + X <- .check_numeric_matrix(X, block_name = 'X') + Y <- .check_numeric_matrix(Y, block_name = 'Y') + + check_cv <- .check_cv_args(validation = validation, + nrepeat = nrepeat, folds = folds, + N = nrow(X)) + validation <- check_cv$validation + nrepeat <- check_cv$nrepeat + folds <- check_cv$folds + + #-- build model and run perf() --------------------------------------# + #---------------------------------------------------------------------------# + pls_res <- pls(X, Y, ncomp, + mode = mode, scale = scale, logratio = logratio, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, multilevel = multilevel) + perf_res <- perf(pls_res, + validation = validation, folds = folds, nrepeat = nrepeat, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } \ No newline at end of file diff --git a/R/tune.plsda.R b/R/tune.plsda.R new file mode 100644 index 00000000..25504f88 --- /dev/null +++ b/R/tune.plsda.R @@ -0,0 +1,243 @@ +# ======================================================================================================== +# tune.plsda: tuning hyperparameters on a plsda method +# ======================================================================================================== + +#' Tuning functions for PLS-DA method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{plsda}. +#' +#' +#' This tuning function should be used to tune the parameters in the +#' \code{plsda} function (number of components and distance metric to select). +#' +#' For a PLS-DA, M-fold or LOO cross-validation is performed with stratified +#' subsampling where all classes are represented in each fold. +#' +#' If \code{validation = "loo"}, leave-one-out cross-validation is performed. +#' By default \code{folds} is set to the number of unique individuals. +#' +#' The function outputs the optimal number of components that achieve the best +#' performance based on the overall error rate or BER. The assessment is +#' data-driven and similar to the process detailed in (Rohart et al., 2016), +#' where one-sided t-tests assess whether there is a gain in performance when +#' adding a component to the model. Our experience has shown that in most cases, +#' the optimal number of components is the number of categories in \code{Y} - +#' 1, but it is worth tuning a few extra components to check (see our website +#' and case studies for more details). +#' +#' For PLS-DA multilevel one-factor analysis, M-fold or LOO cross-validation +#' is performed where all repeated measurements of one sample are in the same +#' fold. Note that logratio transform and the multilevel analysis are performed +#' internally and independently on the training and test set. +#' +#' For a PLS-DA multilevel two-factor analysis, the correlation between +#' components from the within-subject variation of X and the \code{cond} matrix +#' is computed on the whole data set. The reason why we cannot obtain a +#' cross-validation error rate as for the PLS-DA one-factor analysis is +#' the difficulty of decomposing and predicting the within matrices +#' within each fold.
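A minimal sketch of the tune.pls wrapper defined just above, using the liver.toxicity data as in the other examples in this changeset (fold settings illustrative; the returned object is assumed to be the perf() output documented in the @return section, so Q2.total can guide the choice of ncomp against the internally hardcoded limQ2 = 0.0975 threshold):

data(liver.toxicity)
tune_pls <- tune.pls(X = liver.toxicity$gene, Y = liver.toxicity$clinic,
                     ncomp = 3, validation = "Mfold", folds = 5, nrepeat = 3,
                     mode = "regression", seed = 42)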
+#' +#' For a PLS two-factor analysis a PLS canonical mode is run, and the +#' correlation between components from the within-subject variation of X and Y +#' is computed on the whole data set. +#' +#' If \code{validation = "Mfold"}, M-fold cross-validation is performed. How +#' many folds to generate is selected by specifying the number of folds in +#' \code{folds}. +#' +#' If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the +#' Area Under the Curve is averaged using one-vs-all comparison. Note however +#' that the AUC criteria may not be particularly insightful as the prediction +#' threshold we use in PLS-DA differs from an AUC threshold (PLS-DA relies on +#' prediction distances for predictions, see \code{?predict.plsda} for more +#' details) and the supplemental material of the mixOmics article (Rohart et +#' al. 2017). +#' +#' BER is appropriate in case of an unbalanced number of samples per class as +#' it calculates the average proportion of wrongly classified samples in each +#' class, weighted by the number of samples in each class. BER is less biased +#' towards majority classes during the performance assessment. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). +#' +#' The tune.plsda() function calls perf() internally to perform this cross-validation; +#' for more details see the perf() help pages. +#' +#' @param X numeric matrix of predictors. \code{NA}s are allowed. +#' @param Y a factor or a class vector indicating the discrete outcome of each +#' sample. +#' @param ncomp the number of components to include in the model. +#' @param validation character. What kind of (internal) validation to use, +#' matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +#' Default is \code{"Mfold"}. +#' @param folds the folds in the Mfold cross-validation. See Details. +#' @param scale Logical. If scale = TRUE, each block is standardized to zero +#' means and unit variances (default: TRUE) +#' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) +#' @param progressBar set to \code{TRUE} to output the progress bar +#' of the computation (default: \code{FALSE}). +#' @param tol Convergence stopping value. +#' @param max.iter integer, the maximum number of iterations. +#' @param near.zero.var Logical, see the internal \code{\link{nearZeroVar}} +#' function (should be set to TRUE in particular for data with many zero +#' values). Default value is FALSE +#' @param nrepeat Number of times the Cross-Validation process is repeated. +#' @param logratio one of ('none','CLR'). Defaults to 'none' +#' @param multilevel Design matrix for multilevel analysis (for repeated +#' measurements) that indicates the repeated measures on each individual, i.e. +#' the individuals ID. See Details. +#' @param light.output if set to FALSE, the prediction/classification of each +#' sample for each comp is returned. +#' @param signif.threshold numeric between 0 and 1 indicating the significance +#' threshold required for improvement in error rate of the components. Defaults +#' to 0.01. +#' @param dist Distance metric.
Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all" +#' @template arg/BPPARAM +#' @param seed set a number here if you want the function to give reproducible outputs. +#' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. +#' @return matrix of classification error rate estimation. +#' The dimensions correspond to the components in the +#' model and to the prediction method used, respectively. +#' +#' \item{auc}{Averaged AUC values +#' over the \code{nrepeat}} +#' +#' \item{cor.value}{only if multilevel analysis with 2 factors: correlation +#' between latent variables.} +#' +#' @author Kim-Anh Lê Cao, Benoit Gautier, Francois Bartolo, Florian Rohart, +#' Al J Abadi +#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and +#' http://www.mixOmics.org for more details. +#' @references mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.plsda-examples.R + +tune.plsda <- + function (X, Y, + ncomp = 1, + # params related to spls model building + scale = TRUE, + logratio = c('none','CLR'), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, + # params related to CV + validation = "Mfold", + folds = 10, + nrepeat = 1, + signif.threshold = 0.01, + # params related to PA + dist = "all", + auc = FALSE, + # params related to running + progressBar = FALSE, + light.output = TRUE, + BPPARAM = SerialParam(), + seed = NULL + ) + { #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + + BPPARAM$RNGseed <- seed + set.seed(seed) + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #------------------# + #-- check entries --# + if(!is(X, "matrix")) + X = as.matrix(X) + + if (length(dim(X)) != 2 || !is.numeric(X)) + stop("'X' must be a numeric matrix.") + + + # Testing the input Y + if (is.null(multilevel)) + { + if (is.null(Y)) + stop("'Y' has to be something other than NULL.") + + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level") + + } else { + # we expect a vector or a 2-column matrix in 'Y' and the repeated measurements in 'multilevel' + multilevel = data.frame(multilevel) + + if ((nrow(X) != nrow(multilevel))) + stop("unequal number of rows in 'X' and 'multilevel'.") + + if (ncol(multilevel) != 1) + stop("'multilevel' should have a single column for the repeated measurements, other factors should be included in 'Y'.") + + if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0,1,2))# multilevel 1 or 2 factors + stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-column data.frame containing 2 factors.") + + } + + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", call.
= FALSE) + + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + + #-- validation + choices = c("Mfold", "loo") + validation = choices[pmatch(validation, choices)] + if (is.na(validation)) + stop("'validation' must be either 'Mfold' or 'loo'") + + if (validation == 'loo') + { + if (nrepeat != 1) + warning("Leave-One-Out validation does not need to be repeated: 'nrepeat' is set to '1'.") + nrepeat = 1 + } + + if (nrepeat < 3 & ncomp > 1) + message("Note that the number of components cannot be reliably tuned with nrepeat < 3 or validation = 'loo'.") + + #-- logratio + logratio <- match.arg(logratio) + + #-- validation + if (any(is.na(validation)) || length(validation) > 1) + stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) + + + #------------------# + #-- run perf --# + plsda_res <- plsda(X, Y, ncomp, scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, multilevel = multilevel) + perf_res <- perf(plsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, signif.threshold = signif.threshold, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar, + auc = auc) + return(perf_res) + } \ No newline at end of file diff --git a/R/tune.rcc.R b/R/tune.rcc.R index a3eaa5c9..2dc5fb77 100644 --- a/R/tune.rcc.R +++ b/R/tune.rcc.R @@ -33,8 +33,6 @@ library(BiocParallel) #' \code{"Mfolds"} (M-folds). See Details. #' @param folds positive integer. Number of folds to use if #' \code{validation="Mfold"}. Defaults to \code{folds=10}. -#' @param plot logical argument indicating whether an image map should be -#' plotted by calling the \code{imgCV} function. #' @param BPPARAM a BiocParallel parameter object; see \code{BiocParallel::bpparam} #' for details. Default is \code{MulticoreParam()} for parallel processing. #' @param seed set a number here if you want the function to give reproducible outputs. @@ -50,14 +48,8 @@ library(BiocParallel) #' details. #' @keywords multivariate dplot #' @export -#' @examples -#' data(nutrimouse) -#' X <- nutrimouse$lipid -#' Y <- nutrimouse$gene -#' -#' ## this can take some seconds -#' -#' tune.rcc(X, Y, validation = "Mfold") +#' @example ./examples/tune.rcc-examples.R + tune.rcc <- function(X, Y, @@ -65,7 +57,6 @@ tune.rcc <- grid2 = seq(0.001, 1, length = 5), validation = c("loo", "Mfold"), folds = 10, - plot = TRUE, BPPARAM = SerialParam(), seed = NULL) { @@ -119,9 +110,6 @@ tune.rcc <- cv.score.grid = cbind(grid, cv.score) mat = matrix(cv.score, nrow = length(grid1), ncol = length(grid2)) - if (isTRUE(plot)) - image.tune.rcc(list(grid1 = grid1, grid2 = grid2, mat = mat)) - opt = cv.score.grid[cv.score.grid[, 3] == max(cv.score.grid[, 3]), ] out = list(opt.lambda1 = opt[[1]], opt.lambda2 = opt[[2]], diff --git a/R/tune.spca.R b/R/tune.spca.R index 70802e1e..7e90873b 100644 --- a/R/tune.spca.R +++ b/R/tune.spca.R @@ -16,8 +16,10 @@ #' it is recommended to use Leave-One-Out Cross-Validation which can be #' achieved by setting \code{folds = nrow(X)}. #' @inheritParams spca -#' @inheritParams tune.splsda #' @param folds Number of folds in 'Mfold' cross-validation. See details. +#' @param nrepeat Number of times the Cross-Validation process is repeated. +#' @param test.keepX numeric vector for the different number of variables to +#' test from the \eqn{X} data set. #' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating the type #' of parallelisation. See examples.
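With the plot argument removed from tune.rcc above, the former one-call behaviour becomes an explicit two-step call, mirroring the removed example (the plot() method on the returned tune.rcc object replaces the old internal image.tune.rcc() side effect):

data(nutrimouse)
## this can take some seconds
cv.score <- tune.rcc(nutrimouse$lipid, nutrimouse$gene, validation = "Mfold")
plot(cv.score)  # image map of the CV scores, previously drawn via plot = TRUE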
#' @param seed set a number here if you want the function to give reproducible outputs. diff --git a/R/tune.spls.R b/R/tune.spls.R index 6d1050c5..652111e5 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -27,14 +27,18 @@ # ======================================================================================================== -# tune.spls: chose the optimal number of parameters per component on a spls method +# tune.spls: Tuning hyperparameters on a spls method # ======================================================================================================== -# TODO details on nrepeat -# TODO details on pls vs spls -# TODO tidy outputs preferably in an array -#' Tuning functions for sPLS and PLS functions #' -#' @template description/tune +#' Tuning functions for sPLS method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{spls}. +#' +#' +#' This tuning function should be used to tune the parameters in the +#' \code{spls} function (number of components and number of variables to select). +#' #' #' @template section/folds #' @template section/nrepeat @@ -49,19 +53,26 @@ #' @template arg/ncomp #' @template arg/test.keepX-X.matrix #' @template arg/test.keepY -#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} -#' @template arg/mode #' @template arg/validation #' @template arg/folds #' @template arg/nrepeat -#' @param measure The tuning measure to use. See details. +#' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. +#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} +#' @template arg/mode +#' @param scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE). +#' @param tol Positive numeric used as convergence criteria/tolerance during the iterative process. Defaults to 1e-09. +#' @param max.iter Integer, the maximum number of iterations. Defaults to 100. +#' @param near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Defaults to 'none'. See ?logratio.transfo for details. +#' @param multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. #' @template arg/progressBar #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. -#' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}. +#' @param ...
Optional parameters passed to \code{\link{spls}} -#' @return A list that contains: \item{cor.pred}{The correlation of predicted vs +#' @return +#' If \code{test.keepX != NULL} and \code{test.keepY != NULL}, returns a list that contains: +#' \item{cor.pred}{The correlation of predicted vs #' actual components from X (t) and Y (u) for each #' component}\item{RSS.pred}{The Residual Sum of Squares of predicted vs #' actual components from X (t) and Y (u) for each component} @@ -71,6 +82,39 @@ #' \item{choice.ncomp}{returns the optimal number of components for the model #' fitted with \code{$choice.keepX} and \code{$choice.keepY} } \item{call}{The #' function call including the parameters used.} +#' +#' If \code{test.keepX = NULL} and \code{test.keepY = NULL}, returns a list with the following components for every repeat: +#' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +#' inherited from \code{"pls"}, and \code{"spls"}. Only available when in +#' regression (s)PLS.} +#' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +#' else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +#' Note that in the specific case of an sPLS model, it is better to have a look +#' at the Q2.total criterion, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +#' \item{RSS}{Residual Sum of Squares across all selected features and the +#' components.} +#' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected +#' features and the components.} +#' \item{features}{a list of features selected across the +#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} +#' if using standard (non-sparse) PLS.} +#' \item{cor.tpred, cor.upred}{Correlation between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +#' predicted and actual components for X (t) and Y (u)} +#' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, #' Florian Rohart, #' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and @@ -106,23 +150,8 @@ #' 4:e1845.
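The two return shapes documented above follow from the two call modes of tune.spls; a sketch adapted from the removed example (grid values and fold settings illustrative; the all-NULL call is assumed to dispatch to perf() per the refactor below):

data(liver.toxicity)
X <- liver.toxicity$gene
Y <- liver.toxicity$clinic
# ncomp-only tuning: both grids NULL triggers the internal perf() path
tune0 <- tune.spls(X, Y, ncomp = 3, validation = "Mfold", folds = 5, nrepeat = 3)
# grid tuning of keepX/keepY, returning a tune.spls object
tune1 <- tune.spls(X, Y, ncomp = 3, test.keepX = c(5, 10, 15),
                   test.keepY = c(3, 6, 8), measure = "cor",
                   folds = 5, nrepeat = 3)
tune1$choice.keepX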
#' @keywords regression multivariate #' @export -#' @examples +#' @example ./examples/tune.spls-examples.R #' -#' \dontrun{ -#' data(liver.toxicity) -#' X <- liver.toxicity$gene -#' Y <- liver.toxicity$clinic -#' set.seed(42) -#' tune.res = tune.spls( X, Y, ncomp = 3, -#' test.keepX = c(5, 10, 15), -#' test.keepY = c(3, 6, 8), measure = "cor", -#' folds = 5, nrepeat = 3, progressBar = TRUE) -#' tune.res$choice.ncomp -#' tune.res$choice.keepX -#' tune.res$choice.keepY -#' # plot the results -#' plot(tune.res) -#' } # TODO change this so that it simply wraps perf tune.spls <- function(X, Y, test.keepX = NULL, test.keepY = NULL, ncomp, + # params related to spls model building + mode = c('regression', 'canonical', 'classic'), + scale = TRUE, + logratio = "none", + tol = 1e-09, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, + # params related to CV validation = c('Mfold', 'loo'), nrepeat = 1, folds, - mode = c('regression', 'canonical', 'classic'), measure = NULL, + # params related to running tune BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) { + + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# out = list() mode <- match.arg(mode) BPPARAM$RNGseed <- seed + + # hardcode to streamline + limQ2 <- 0.0975 X <- .check_numeric_matrix(X, block_name = 'X') Y <- .check_numeric_matrix(Y, block_name = 'Y') @@ -156,6 +199,20 @@ tune.spls <- nrepeat <- check_cv$nrepeat folds <- check_cv$folds + #-- test.keepX + #-> if test.keepX set to NULL run perf() i.e. parameter tuning of just ncomp using all features + if (is.null(test.keepX) && is.null(test.keepY)){ + print("test.keepX and test.keepY are set to NULL, tuning only for number of components...") + spls_res <- spls(X, Y, ncomp, keepX = ncol(X), keepY = ncol(Y), + mode = mode, scale = scale, logratio = logratio, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, multilevel = multilevel) + perf_res <- perf(spls_res, + validation = validation, folds = folds, nrepeat = nrepeat, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } else { + test.keepX <- .change_if_null(arg = test.keepX, default = ncol(X)) test.keepY <- .change_if_null(arg = test.keepY, default = ncol(Y)) @@ -244,7 +301,11 @@ tune.spls <- pls.model <- spls(X = X, Y = Y, keepX = c(choice.keepX, keepX), keepY = c(choice.keepY, keepY), - ncomp = comp, mode = mode, ...) + ncomp = comp, mode = mode, + scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, + multilevel = multilevel, + ...)
# run perf in serial to avoid nested parallel processes, but make sure seed is passed into perf pls.perf <- perf(pls.model, validation = validation, folds = folds, nrepeat = nrepeat, @@ -456,4 +517,5 @@ tune.spls <- class <- c('tune.pls') class(out) <- if (spls.model) class <- c(class, 'tune.spls') return(out) + } } diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 7f7ddcd6..cbbd143c 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -1,13 +1,12 @@ # ======================================================================================================== -# tune.splsda: chose the optimal number of parameters per component on a splsda method +# tune.splsda: tuning hyperparameters on a splsda method # ======================================================================================================== #' Tuning functions for sPLS-DA method #' #' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input -#' grid to determine optimal values for the sparsity parameters in -#' \code{splsda}. -#' +#' grid to determine optimal values for the parameters in \code{splsda}. +#' #' #' This tuning function should be used to tune the parameters in the #' \code{splsda} function (number of components and number of variables in @@ -69,12 +68,19 @@ #' More details about the prediction distances in \code{?predict} and the #' supplemental material of the mixOmics article (Rohart et al. 2017). #' +#' If test.keepX is set to NULL, the \code{perf()} function will be run internally, +#' which performs cross-validation to identify the optimal number of components and +#' distance measure. Running an initial tuning pass with \code{test.keepX = NULL} speeds +#' up the parameter tuning workflow, as a lower \code{ncomp} value can then be used for +#' variable selection tuning. +#' #' @param X numeric matrix of predictors. \code{NA}s are allowed. #' @param Y \code{if(method = 'spls')} numeric vector or matrix of continuous #' responses (for multi-response models) \code{NA}s are allowed. #' @param ncomp the number of components to include in the model. #' @param test.keepX numeric vector for the different number of variables to -#' test from the \eqn{X} data set +#' test from the \eqn{X} data set. If set to NULL, tuning will be performed on ncomp +#' using all variables in the \eqn{X} data set. #' @param already.tested.X Optional, if \code{ncomp > 1} A numeric vector #' indicating the number of variables to select from the \eqn{X} data set on #' the firsts components. @@ -83,11 +89,12 @@ #' Default is \code{"Mfold"}. #' @param folds the folds in the Mfold cross-validation. See Details. #' @param dist distance metric to use for \code{splsda} to estimate the -#' classification error rate, should be a subset of \code{"centroids.dist"}, -#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +#' classification error rate, should be one of \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +#' If \code{test.keepX = NULL} multiple distances can be supplied, or "all". #' @param measure Three misclassification measure are available: overall #' misclassification error \code{overall}, the Balanced Error Rate \code{BER} -#' or the Area Under the Curve \code{AUC} +#' or the Area Under the Curve \code{AUC}. Only used when \code{test.keepX} is not NULL. #' @param scale Logical.
If scale = TRUE, each block is standardized to zero #' means and unit variances (default: TRUE) #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) @@ -122,6 +129,14 @@ #' \item{error.rate.class}{returns the error rate for each level of \code{Y} #' and for each component computed with the optimal keepX} #' +#' If test.keepX = FALSE,produces a matrix of classification +#' error rate estimation. The dimensions correspond to the components in the +#' model and to the prediction method used, respectively. Note that error rates +#' reported in any component include the performance of the model in earlier +#' components for the specified \code{keepX} parameters (e.g. error rate +#' reported for component 3 for \code{keepX = 20} already includes the fitted +#' model on components 1 and 2 for \code{keepX = 20}). +#' #' \item{predict}{Prediction values for each sample, each \code{test.keepX}, #' each comp and each repeat. Only if light.output=FALSE} #' \item{class}{Predicted class for each sample, each \code{test.keepX}, each @@ -143,72 +158,41 @@ #' e1005752 #' @keywords regression multivariate #' @export -#' @examples -#' ## First example: analysis with sPLS-DA -#' -#' data(breast.tumors) -#' X = breast.tumors$gene.exp -#' Y = as.factor(breast.tumors$sample$treatment) -#' tune = tune.splsda(X, Y, ncomp = 1, nrepeat = 10, logratio = "none", -#' test.keepX = c(5, 10, 15), folds = 10, dist = "max.dist", -#' progressBar = TRUE) -#' -#' \dontrun{ -#' # 5 components, optimising 'keepX' and 'ncomp' -#' tune = tune.splsda(X, Y, ncomp = 5, test.keepX = c(5, 10, 15), -#' folds = 10, dist = "max.dist", nrepeat = 5, progressBar = FALSE) -#' -#' tune$choice.ncomp -#' tune$choice.keepX -#' plot(tune) -#' -#' ## only tune component 3 and 4 -#' # keeping 5 and 10 variables on the first two components respectively -#' -#' tune = tune.splsda(X = X,Y = Y, ncomp = 4, -#' already.tested.X = c(5,10), -#' test.keepX = seq(1,10,2), progressBar = TRUE) -#' -#' ## Second example: multilevel one-factor analysis with sPLS-DA -#' -#' data(vac18) -#' X = vac18$genes -#' Y = vac18$stimulation -#' # sample indicates the repeated measurements -#' design = data.frame(sample = vac18$sample) -#' -#' tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", -#' test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) -#' -#' } +#' @example ./examples/tune.splsda-examples.R + tune.splsda <- function (X, Y, ncomp = 1, - test.keepX = c(5, 10, 15), + # sparsity params + test.keepX = NULL, # in which case perf() is run internally to only tune ncomp already.tested.X, + # params related to spls model building + scale = TRUE, + logratio = c('none','CLR'), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, + # params related to CV validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, + # params related to PA dist = "max.dist", - measure = "BER", # one of c("overall","BER") - scale = TRUE, + measure = "BER", # or 'overall' auc = FALSE, + # params related to running progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - logratio = c('none','CLR'), - multilevel = NULL, light.output = TRUE, - signif.threshold = 0.01, BPPARAM = SerialParam(), seed = NULL ) - { #-- checking general input parameters --------------------------------------# + { #-- checking general input parameters --------------------------------------# #---------------------------------------------------------------------------# - 
BPPARAM$RNGseed <- seed - set.seed(seed) + BPPARAM$RNGseed <- seed + set.seed(seed) #-- check significance threshold signif.threshold <- .check_alpha(signif.threshold) @@ -251,9 +235,6 @@ tune.splsda <- if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0,1,2))# multilevel 1 or 2 factors stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-columns data.frame containing 2 factors.") - multilevel = data.frame(multilevel, Y) - multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements - } @@ -281,12 +262,11 @@ tune.splsda <- if (nrepeat < 3 & ncomp > 1) message("Note that the number of components cannot be reliably tuned with nrepeat < 3 or validaion = 'loo'.") + #-- logratio logratio <- match.arg(logratio) - - #if ((!is.null(already.tested.X)) && (length(already.tested.X) != (ncomp - 1)) ) - #stop("The number of already tested parameters should be NULL or ", ncomp - 1, " since you set ncomp = ", ncomp) - + + #--already.tested.X if (missing(already.tested.X)) { already.tested.X = NULL @@ -311,295 +291,317 @@ tune.splsda <- if (any(is.na(validation)) || length(validation) > 1) stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) - - #-- test.keepX - if (is.null(test.keepX) | length(test.keepX) == 1 | !is.numeric(test.keepX)) - stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) - - # remove some test.keepX if needed - if (any(test.keepX > ncol(X))){ - test.keepX = test.keepX[-which(test.keepX>ncol(X))] - if (length(test.keepX) < 2) - stop("Some entries of 'test.keepX' were higher than the number of - variables in 'X' and were removed, the resulting 'test.keepX' has now - too few entries (<2)", call. = FALSE) - } - - # add colnames and rownames if missing - X.names = dimnames(X)[[2]] - if (is.null(X.names)) - { - X.names = paste0("X", 1:ncol(X)) - dimnames(X)[[2]] = X.names - } - - ind.names = dimnames(X)[[1]] - if (is.null(ind.names)) - { - ind.names = 1:nrow(X) - rownames(X) = ind.names - } - - if (length(unique(rownames(X))) != nrow(X)) - stop("samples should have a unique identifier/rowname") - if (length(unique(X.names)) != ncol(X)) - stop("Unique indentifier is needed for the columns of X") - - rm(X.names);rm(ind.names) - - #-- end checking --# - #------------------# - - - #---------------------------------------------------------------------------# - #-- logration + multilevel approach ----------------------------------------# - # we can do logratio and multilevel on the whole data as these transformation are done per sample - X = logratio.transfo(X = X, logratio = logratio) - - if (!is.null(multilevel)) # logratio is applied per sample, multilevel as well, so both can be done on the whole data - { + + #-- test.keepX + #-> if test.keepX set to NULL run perf() i.e. 
parameter tuning of just ncomp + if (is.null(test.keepX)){ + print("test.keepX set to NULL, tuning only for number of components...") + splsda_res <- splsda(X, Y, ncomp, scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, multilevel = multilevel) + perf_res <- perf(splsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, signif.threshold = signif.threshold, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar, + auc = auc) + return(perf_res) + #-> if test.keepX is not NULL, run tune function as before + } else { + + # reformat multilevel design for tuning variables + if (!is.null(multilevel)){ + multilevel = data.frame(multilevel, Y) + multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements + } - Xw = withinVariation(X, design = multilevel) - X = Xw + #-- test.keepX + if (length(test.keepX) == 1 | !is.numeric(test.keepX)) + stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) - #-- Need to set Y variable for 1 or 2 factors - Y = multilevel[, -1, drop=FALSE] - if (ncol(Y) >= 1) - Y = apply(Y, 1, paste, collapse = ".") # paste is to combine in the case we have 2 levels + # remove some test.keepX if needed + if (any(test.keepX > ncol(X))){ + test.keepX = test.keepX[-which(test.keepX>ncol(X))] + if (length(test.keepX) < 2) + stop("Some entries of 'test.keepX' were higher than the number of + variables in 'X' and were removed, the resulting 'test.keepX' has now + too few entries (<2)", call. = FALSE) + } - Y = as.factor(Y) - } - #-- logration + multilevel approach ----------------------------------------# - #---------------------------------------------------------------------------# - - - #---------------------------------------------------------------------------# - #-- NA calculation ----------------------------------------------------# - - misdata = c(X=anyNA(X), Y=FALSE) # Detection of missing data. we assume no missing values in the factor Y - - if (any(misdata)) - { - is.na.A = is.na(X) + # add colnames and rownames if missing + X.names = dimnames(X)[[2]] + if (is.null(X.names)) + { + X.names = paste0("X", 1:ncol(X)) + dimnames(X)[[2]] = X.names + } - #ind.NA = which(apply(is.na.A, 1, sum) > 0) # calculated only once - #ind.NA.col = which(apply(is.na.A, 2, sum) >0) # indice of the col that have missing values. used in the deflation - } else { - is.na.A = NULL - #ind.NA = ind.NA.col = NULL - } - #-- NA calculation ----------------------------------------------------# - #---------------------------------------------------------------------------# - - - - test.keepX = sort(unique(test.keepX)) #sort test.keepX so as to be sure to chose the smallest in case of several minimum - names(test.keepX) = test.keepX - # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. 
ncomp) - if ((!is.null(already.tested.X))) - { - comp.real = (length(already.tested.X) + 1):ncomp - } else { - comp.real = 1:ncomp - } - - - choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist") - dist = match.arg(dist, choices, several.ok = TRUE) - - - mat.error.rate = list() - error.per.class = list() - AUC = list() - - if(nrepeat>1 & all(measure != "AUC")){ - mat.sd.error = matrix(0,nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), - dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) - } else{ - mat.sd.error=NULL - } - mat.mean.error = matrix(nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), - dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) - - # first: near zero var on the whole data set - if(near.zero.var == TRUE) - { - nzv = nearZeroVar(X) - if (length(nzv$Position > 0)) + ind.names = dimnames(X)[[1]] + if (is.null(ind.names)) { - warning("Zero- or near-zero variance predictors.\nReset predictors matrix to not near-zero variance predictors.\nSee $nzv for problematic predictors.") - X = X[, -nzv$Position, drop=TRUE] + ind.names = 1:nrow(X) + rownames(X) = ind.names + } + + if (length(unique(rownames(X))) != nrow(X)) + stop("samples should have a unique identifier/rowname") + if (length(unique(X.names)) != ncol(X)) + stop("Unique indentifier is needed for the columns of X") + + rm(X.names);rm(ind.names) + + #-- end checking --# + #------------------# + + + #---------------------------------------------------------------------------# + #-- logration + multilevel approach ----------------------------------------# + # we can do logratio and multilevel on the whole data as these transformation are done per sample + X = logratio.transfo(X = X, logratio = logratio) + + if (!is.null(multilevel)) # logratio is applied per sample, multilevel as well, so both can be done on the whole data + { + + Xw = withinVariation(X, design = multilevel) + X = Xw - if(ncol(X)==0) - stop("No more predictors after Near Zero Var has been applied!") + #-- Need to set Y variable for 1 or 2 factors + Y = multilevel[, -1, drop=FALSE] + if (ncol(Y) >= 1) + Y = apply(Y, 1, paste, collapse = ".") # paste is to combine in the case we have 2 levels + Y = as.factor(Y) } - } - - if (is.null(multilevel) | (!is.null(multilevel) && ncol(multilevel) == 2)) - { - if(light.output == FALSE) - prediction.all = class.all = list() + #-- logration + multilevel approach ----------------------------------------# + #---------------------------------------------------------------------------# + + + #---------------------------------------------------------------------------# + #-- NA calculation ----------------------------------------------------# - if(auc) + misdata = c(X=anyNA(X), Y=FALSE) # Detection of missing data. we assume no missing values in the factor Y + + if (any(misdata)) { - auc.mean.sd=list() - if(light.output == FALSE) - auc.all=list() + is.na.A = is.na(X) + + #ind.NA = which(apply(is.na.A, 1, sum) > 0) # calculated only once + #ind.NA.col = which(apply(is.na.A, 2, sum) >0) # indice of the col that have missing values. 
used in the deflation + } else { + is.na.A = NULL + #ind.NA = ind.NA.col = NULL } + #-- NA calculation ----------------------------------------------------# + #---------------------------------------------------------------------------# - class.object=c("mixo_splsda","DA") - error.per.class.keepX.opt = list() - - if(all(measure != "AUC")){ - error.per.class.keepX.opt.mean = matrix(0, nrow = nlevels(Y), ncol = length(comp.real), - dimnames = list(c(levels(Y)), c(paste0('comp', comp.real)))) + + + test.keepX = sort(unique(test.keepX)) #sort test.keepX so as to be sure to chose the smallest in case of several minimum + names(test.keepX) = test.keepX + # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) + if ((!is.null(already.tested.X))) + { + comp.real = (length(already.tested.X) + 1):ncomp } else { - error.per.class.keepX.opt.mean=NULL + comp.real = 1:ncomp + } + + + choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist") + dist = match.arg(dist, choices, several.ok = TRUE) + + + mat.error.rate = list() + error.per.class = list() + AUC = list() + + if(nrepeat>1 & all(measure != "AUC")){ + mat.sd.error = matrix(0,nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), + dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) + } else{ + mat.sd.error=NULL } - # successively tune the components until ncomp: comp1, then comp2, ... - for(comp in 1:length(comp.real)) + mat.mean.error = matrix(nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), + dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) + + # first: near zero var on the whole data set + if(near.zero.var == TRUE) { - - if (progressBar == TRUE & is(BPPARAM, 'SerialParam')) - cat("\ncomp",comp.real[comp], "\n") - - result = MCVfold.spls(X, Y, multilevel = multilevel, validation = validation, folds = folds, nrepeat = nrepeat, ncomp = 1 + length(already.tested.X), - choice.keepX = already.tested.X, - test.keepX = test.keepX, test.keepY = nlevels(Y), measure = measure, dist = dist, scale=scale, - near.zero.var = near.zero.var, progressBar = progressBar, tol = tol, max.iter = max.iter, auc = auc, - BPPARAM = BPPARAM, - misdata = misdata, is.na.A = is.na.A, class.object=class.object) - - # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.spls' can work with multiple distances - mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] - mat.mean.error[, comp]=result[[measure]]$error.rate.mean[[1]] - if (!is.null(result[[measure]]$error.rate.sd[[1]])) - mat.sd.error[, comp]=result[[measure]]$error.rate.sd[[1]] - - # confusion matrix for keepX.opt - if (all(measure!="AUC")){ - error.per.class.keepX.opt[[comp]]=result[[measure]]$confusion[[1]] - error.per.class.keepX.opt.mean[, comp]=apply(result[[measure]]$confusion[[1]], 1, mean) - } - - # best keepX - already.tested.X = c(already.tested.X, result[[measure]]$keepX.opt[[1]]) - - if(light.output == FALSE) + nzv = nearZeroVar(X) + if (length(nzv$Position > 0)) { - #prediction of each samples for each fold and each repeat, on each comp - class.all[[comp]] = result$class.comp[[1]] - prediction.all[[comp]] = result$prediction.comp + warning("Zero- or near-zero variance predictors.\nReset predictors matrix to not near-zero variance predictors.\nSee $nzv for problematic predictors.") + X = X[, -nzv$Position, drop=TRUE] + + if(ncol(X)==0) + stop("No more predictors after Near Zero Var has been applied!") + } + } + + if 
(is.null(multilevel) | (!is.null(multilevel) && ncol(multilevel) == 2)) + { + if(light.output == FALSE) + prediction.all = class.all = list() if(auc) { - auc.mean.sd[[comp]] = result$auc + auc.mean.sd=list() if(light.output == FALSE) - auc.all[[comp]] = result$auc.all + auc.all=list() } - } # end comp - - names(mat.error.rate) = c(paste0('comp', comp.real)) - if (all(measure!="AUC")) - names(error.per.class.keepX.opt) = c(paste0('comp', comp.real)) - - names(already.tested.X) = c(paste0('comp', 1:ncomp)) - - if (progressBar == TRUE) - cat('\n') - - # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) - if(nrepeat > 2 & length(comp.real) >1 & all(measure!="AUC")) - { - keepX = already.tested.X - error.keepX = NULL + class.object=c("mixo_splsda","DA") + error.per.class.keepX.opt = list() + + if(all(measure != "AUC")){ + error.per.class.keepX.opt.mean = matrix(0, nrow = nlevels(Y), ncol = length(comp.real), + dimnames = list(c(levels(Y)), c(paste0('comp', comp.real)))) + } else { + error.per.class.keepX.opt.mean=NULL + } + # successively tune the components until ncomp: comp1, then comp2, ... for(comp in 1:length(comp.real)) { - ind.row = match(keepX[[comp.real[comp]]],test.keepX) - error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + + if (progressBar == TRUE & is(BPPARAM, 'SerialParam')) + cat("\ncomp",comp.real[comp], "\n") + + result = MCVfold.spls(X, Y, multilevel = multilevel, validation = validation, folds = folds, nrepeat = nrepeat, ncomp = 1 + length(already.tested.X), + choice.keepX = already.tested.X, + test.keepX = test.keepX, test.keepY = nlevels(Y), measure = measure, dist = dist, scale=scale, + near.zero.var = near.zero.var, progressBar = progressBar, tol = tol, max.iter = max.iter, auc = auc, + BPPARAM = BPPARAM, + misdata = misdata, is.na.A = is.na.A, class.object=class.object) + + # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.spls' can work with multiple distances + mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] + mat.mean.error[, comp]=result[[measure]]$error.rate.mean[[1]] + if (!is.null(result[[measure]]$error.rate.sd[[1]])) + mat.sd.error[, comp]=result[[measure]]$error.rate.sd[[1]] + + # confusion matrix for keepX.opt + if (all(measure!="AUC")){ + error.per.class.keepX.opt[[comp]]=result[[measure]]$confusion[[1]] + error.per.class.keepX.opt.mean[, comp]=apply(result[[measure]]$confusion[[1]], 1, mean) + } + + # best keepX + already.tested.X = c(already.tested.X, result[[measure]]$keepX.opt[[1]]) + + if(light.output == FALSE) + { + #prediction of each samples for each fold and each repeat, on each comp + class.all[[comp]] = result$class.comp[[1]] + prediction.all[[comp]] = result$prediction.comp + } + + if(auc) + { + auc.mean.sd[[comp]] = result$auc + if(light.output == FALSE) + auc.all[[comp]] = result$auc.all + } + + } # end comp + + names(mat.error.rate) = c(paste0('comp', comp.real)) + if (all(measure!="AUC")) + names(error.per.class.keepX.opt) = c(paste0('comp', comp.real)) + + names(already.tested.X) = c(paste0('comp', 1:ncomp)) + + if (progressBar == TRUE) + cat('\n') + + # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) + if(nrepeat > 2 & length(comp.real) >1 & all(measure!="AUC")) + { + keepX = already.tested.X + error.keepX = NULL + for(comp in 1:length(comp.real)) + { + ind.row = match(keepX[[comp.real[comp]]],test.keepX) + error.keepX = 
cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + } + colnames(error.keepX) = c(paste0('comp', comp.real)) + + opt = t.test.process(error.keepX) + + ncomp_opt = comp.real[opt] + } else if(nrepeat > 2 & length(comp.real) >1 & all(measure=="AUC")){ + #hacking t.test.process for AUC + keepX = already.tested.X + error.keepX = NULL + for(comp in 1:length(comp.real)) + { + ind.row = match(keepX[[comp.real[comp]]],test.keepX) + error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + } + colnames(error.keepX) = c(paste0('comp', comp.real)) + + #1- to change the AUC to an 'error' and check for decrease + opt = t.test.process(1-error.keepX, alpha = signif.threshold) + + ncomp_opt = comp.real[opt] + } else { + ncomp_opt = error.keepX = NULL } - colnames(error.keepX) = c(paste0('comp', comp.real)) - opt = t.test.process(error.keepX) + result = list( + error.rate = mat.mean.error, + error.rate.sd = mat.sd.error, + error.rate.all = mat.error.rate, + choice.keepX = already.tested.X, + choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), + error.rate.class = error.per.class.keepX.opt.mean, + error.rate.class.all = error.per.class.keepX.opt) - ncomp_opt = comp.real[opt] - } else if(nrepeat > 2 & length(comp.real) >1 & all(measure=="AUC")){ - #hacking t.test.process for AUC - keepX = already.tested.X - error.keepX = NULL - for(comp in 1:length(comp.real)) + if(light.output == FALSE) { - ind.row = match(keepX[[comp.real[comp]]],test.keepX) - error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + names(class.all) = names(prediction.all) = c(paste0('comp', comp.real)) + result$predict = prediction.all + result$class = class.all } - colnames(error.keepX) = c(paste0('comp', comp.real)) + if(auc) + { + + #we add the AUC outputs only if it was not the measure + #otherwise there is twice the same output + if(all(measure != "AUC")){ + names(auc.mean.sd) = c(paste0('comp', comp.real)) + result$auc = auc.mean.sd + } + if(light.output == FALSE) + { + names(auc.all) = c(paste0('comp', comp.real)) + result$auc.all =auc.all + } + } + result$measure = measure + result$call = match.call() + result$call$nrepeat = nrepeat + result$nrepeat = nrepeat - #1- to change the AUC to an 'error' and check for decrease - opt = t.test.process(1-error.keepX, alpha = signif.threshold) + class(result) = "tune.splsda" - ncomp_opt = comp.real[opt] + return(result) } else { - ncomp_opt = error.keepX = NULL - } - - result = list( - error.rate = mat.mean.error, - error.rate.sd = mat.sd.error, - error.rate.all = mat.error.rate, - choice.keepX = already.tested.X, - choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), - error.rate.class = error.per.class.keepX.opt.mean, - error.rate.class.all = error.per.class.keepX.opt) - - if(light.output == FALSE) - { - names(class.all) = names(prediction.all) = c(paste0('comp', comp.real)) - result$predict = prediction.all - result$class = class.all - } - if(auc) - { + # if multilevel with 2 factors, we can not do as before because withinvariation depends on the factors, we maximase a correlation + message("For a two-factor analysis, the tuning criterion is based on the maximisation of the correlation between the components on the whole data set") - #we add the AUC outputs only if it was not the measure - #otherwise there is twice the same output - if(all(measure != "AUC")){ - names(auc.mean.sd) = c(paste0('comp', comp.real)) - result$auc = auc.mean.sd - } - if(light.output == FALSE) + cor.value = vector(length = length(test.keepX)) + names(cor.value) = 
test.keepX + + for (i in 1:length(test.keepX)) { - names(auc.all) = c(paste0('comp', comp.real)) - result$auc.all =auc.all + spls.train = splsda(X, Y, ncomp = ncomp, keepX = c(already.tested.X, test.keepX[i]), logratio = logratio, near.zero.var = FALSE) + + # Note: this is performed on the full data set + # (could be done with resampling (bootstrap) (option 1) and/or prediction (option 2)) + cor.value[i] = cor(spls.train$variates$X[, ncomp], spls.train$variates$Y[, ncomp]) + # } - } - result$measure = measure - result$call = match.call() - result$call$nrepeat = nrepeat - result$nrepeat = nrepeat - - class(result) = "tune.splsda" - - return(result) - } else { - # if multilevel with 2 factors, we can not do as before because withinvariation depends on the factors, we maximase a correlation - message("For a two-factor analysis, the tuning criterion is based on the maximisation of the correlation between the components on the whole data set") - - cor.value = vector(length = length(test.keepX)) - names(cor.value) = test.keepX - - for (i in 1:length(test.keepX)) - { - spls.train = splsda(X, Y, ncomp = ncomp, keepX = c(already.tested.X, test.keepX[i]), logratio = logratio, near.zero.var = FALSE) + return(list(cor.value = cor.value)) - # Note: this is performed on the full data set - # (could be done with resampling (bootstrap) (option 1) and/or prediction (option 2)) - cor.value[i] = cor(spls.train$variates$X[, ncomp], spls.train$variates$Y[, ncomp]) - # } - return(list(cor.value = cor.value)) - - } + } # end else statement (i.e. test.keepX is not NULL) } diff --git a/examples/perf.assess-examples.R b/examples/perf.assess-examples.R new file mode 100644 index 00000000..30a33fd2 --- /dev/null +++ b/examples/perf.assess-examples.R @@ -0,0 +1,96 @@ +## PLS-DA example + +data(liver.toxicity) # rats gex and clinical measurements/treatments +unique(liver.toxicity$treatment$Treatment.Group) # 16 groups +length(liver.toxicity$treatment$Treatment.Group) # 64 samples + +plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2) + +performance <- perf.assess(plsda.res, + # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, + seed = 12) # for reproducibility, remove for analysis + +performance$error.rate$BER + +## sPLS-DA example + +splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, + keepX = c(25, 25), ncomp = 2) + +performance <- perf.assess(splsda.res, + validation = "Mfold", folds = 3, nrepeat = 10, + seed = 12) + +performance$error.rate$BER # can see slight improvement in error rate over PLS-DA example + +## PLS example + +ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model + +pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2) + +performance <- perf.assess(pls.res, + validation = "Mfold", folds = 3, nrepeat = 10, + seed = 12) + +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary + +## sPLS example + +spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50)) + +performance <- perf.assess(spls.res, + validation = "Mfold", folds = 3, nrepeat = 10, + seed = 12) + +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary + + +## block PLS-DA example + +data("breast.TCGA") +mrna <- breast.TCGA$data.train$mrna +mirna <- breast.TCGA$data.train$mirna +data <- list(mrna = mrna, mirna = mirna) +design <- 
matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data)))
+diag(design) <- 0
+
+block.plsda.res <- block.plsda(X = data, Y = breast.TCGA$data.train$subtype,
+                               ncomp = 2, design = design)
+
+performance <- perf.assess(block.plsda.res)
+
+performance$error.rate.per.class # error rate per class per distance metric
+
+## block sPLS-DA example
+
+block.splsda.res <- block.splsda(X = data, Y = breast.TCGA$data.train$subtype,
+                                 ncomp = 2, design = design,
+                                 keepX = list(mrna = c(8,8), mirna = c(8,8)))
+
+performance <- perf.assess(block.splsda.res)
+
+performance$error.rate.per.class
+
+## MINT PLS-DA example
+
+data("stemcells")
+
+mint.plsda.res <- mint.plsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3,
+                             study = stemcells$study)
+
+performance <- perf.assess(mint.plsda.res)
+
+performance$global.error$BER # global error per distance metric
+
+## MINT sPLS-DA example
+
+mint.splsda.res <- mint.splsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3,
+                               keepX = c(10, 5, 15), study = stemcells$study)
+
+performance <- perf.assess(mint.splsda.res)
+
+performance$global.error$BER # error slightly higher in this sparse model versus non-sparse
\ No newline at end of file
diff --git a/examples/tune.block.plsda-examples.R b/examples/tune.block.plsda-examples.R
new file mode 100644
index 00000000..c84e9f5f
--- /dev/null
+++ b/examples/tune.block.plsda-examples.R
@@ -0,0 +1,49 @@
+data("breast.TCGA")
+
+# X data - list of mRNA, miRNA and protein
+X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
+          protein = breast.TCGA$data.train$protein)
+
+# Y data - the breast tumour subtype factor
+Y <- breast.TCGA$data.train$subtype
+
+# subset the X and Y data to speed up computation in this example
+set.seed(100)
+subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]]
+X <- lapply(X, function(omic) omic[subset,])
+Y <- Y[subset]
+
+# set up a full design where every block is connected
+# could also consider other weights, see our mixOmics manuscript
+design = matrix(1, ncol = length(X), nrow = length(X),
+                dimnames = list(names(X), names(X)))
+diag(design) = 0
+design
+
+## Tune number of components to keep - use all distance metrics
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("all"))
+
+plot(tune_res)
+tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist
+
+
+## Tune number of components to keep - use majority vote rather than weighted vote
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("all"),
+                             weighted = FALSE)
+tune_res$weights
+
+## Tune number of components to keep - plot just max.dist
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("max.dist"))
+plot(tune_res)
\ No newline at end of file
diff --git a/examples/tune.block.splsda-examples.R b/examples/tune.block.splsda-examples.R
index 67ce833a..dd60ebce 100644
--- a/examples/tune.block.splsda-examples.R
+++ b/examples/tune.block.splsda-examples.R
@@ -1,58 +1,67 @@
-\dontrun{
+## Set up data
+
+# load data
data("breast.TCGA")
-# this is the X data as a list of mRNA and miRNA; the Y data set is a single data set of proteins
-data = list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
-            protein = breast.TCGA$data.train$protein)
+
+# X data - list of mRNA, miRNA and protein
+X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
+          protein = breast.TCGA$data.train$protein)
+
+# Y data - the breast tumour subtype factor
+Y <- breast.TCGA$data.train$subtype
+
+# subset the X and Y data to speed up computation in this example
+set.seed(100)
+subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]]
+X <- lapply(X, function(omic) omic[subset,])
+Y <- Y[subset]
+
# set up a full design where every block is connected
# could also consider other weights, see our mixOmics manuscript
-design = matrix(1, ncol = length(data), nrow = length(data),
-                dimnames = list(names(data), names(data)))
+design = matrix(1, ncol = length(X), nrow = length(X),
+                dimnames = list(names(X), names(X)))
diag(design) = 0
design
-# set number of component per data set
-ncomp = 3
-# Tuning the first two components
-# -------------
-## Not run:
+## Tune number of components to keep
+tune_res <- tune.block.splsda(X, Y, design = design,
+                              ncomp = 5,
+                              test.keepX = NULL,
+                              validation = "Mfold", nrepeat = 3,
+                              dist = "all", measure = "BER",
+                              seed = 13)
+
+plot(tune_res)
+
+tune_res$choice.ncomp # 3 components best
+
+## Tune number of variables to keep
+
# definition of the keepX value to be tested for each block mRNA miRNA and protein
-# names of test.keepX must match the names of 'data'
+# names of test.keepX must match the names of 'X'
test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8))
-# the following may take some time to run, so we subset the data first.
-# Note that for thorough tuning, nrepeat should be >= 3 so that significance of
-# the model improvement can be measured
-## ---- subset by 3rd of samples
-set.seed(100)
-subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]]
-data <- lapply(data, function(omic) omic[subset,])
-Y <- breast.TCGA$data.train$subtype[subset]
-## ---- run
-# Check if the environment variable exists (during R CMD check) and limit cores accordingly
-max_cores <- if (Sys.getenv("_R_CHECK_LIMIT_CORES_") != "") 2 else parallel::detectCores() - 1
-# Setup the parallel backend with the appropriate number of workers
-BPPARAM <- BiocParallel::MulticoreParam(workers = max_cores)
-
-tune <- tune.block.splsda(
-    X = data,
-    Y = Y,
-    ncomp = ncomp,
-    test.keepX = test.keepX,
-    design = design,
-    nrepeat = 2,
-    BPPARAM = BPPARAM
-)
-
-plot(tune)
-tune$choice.ncomp
-tune$choice.keepX
+# load parallel package
+library(BiocParallel)
+
+# run tuning in parallel on 2 cores, output plot on overall error
+tune_res <- tune.block.splsda(X, Y, design = design,
+                              ncomp = 2,
+                              test.keepX = test.keepX,
+                              validation = "Mfold", nrepeat = 3,
+                              measure = "overall",
+                              seed = 13, BPPARAM = SnowParam(workers = 2))
+
+plot(tune_res)
+tune_res$choice.keepX

# Now tuning a new component given previous tuned keepX
-already.tested.X = tune$choice.keepX
-tune = tune.block.splsda(X = data, Y = Y,
-                         ncomp = 4, test.keepX = test.keepX, design = design,
-                         already.tested.X = already.tested.X,
-                         BPPARAM = BPPARAM
-                         )
-tune$choice.keepX
-}
+already.tested.X <- tune_res$choice.keepX
+tune_res <- tune.block.splsda(X, Y, design = design,
+                              ncomp = 3,
+                              test.keepX = test.keepX,
+                              validation = "Mfold", nrepeat = 3,
+                              measure = "overall",
+                              seed = 13, BPPARAM = SnowParam(workers = 2),
+                              already.tested.X = already.tested.X)
+tune_res$choice.keepX
\ No newline at end of file
diff --git a/examples/tune.mint.plsda-examples.R b/examples/tune.mint.plsda-examples.R
new file mode 100644
index 00000000..2c08350d
--- /dev/null
+++ b/examples/tune.mint.plsda-examples.R
@@ -0,0
+1,13 @@
+# set up data
+data(stemcells)
+data <- stemcells$gene
+type.id <- stemcells$celltype
+exp <- stemcells$study
+
+# tune number of components
+tune_res <- tune.mint.plsda(X = data, Y = type.id, ncomp = 5,
+                            near.zero.var = FALSE,
+                            study = exp)
+
+plot(tune_res)
+tune_res$choice.ncomp # 1 component
\ No newline at end of file
diff --git a/examples/tune.mint.splsda-examples.R b/examples/tune.mint.splsda-examples.R
new file mode 100644
index 00000000..c8d10a6c
--- /dev/null
+++ b/examples/tune.mint.splsda-examples.R
@@ -0,0 +1,30 @@
+# set up data
+data(stemcells)
+data <- stemcells$gene
+type.id <- stemcells$celltype
+exp <- stemcells$study
+
+# tune number of components
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 5,
+                             near.zero.var = FALSE,
+                             study = exp,
+                             test.keepX = NULL)
+
+plot(tune_res)
+tune_res$choice.ncomp # 1 component
+
+## tune number of variables to keep
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 1,
+                             near.zero.var = FALSE,
+                             study = exp,
+                             test.keepX = seq(1,10,1))
+
+plot(tune_res)
+tune_res$choice.keepX # 9 variables to keep on component 1
+
+## tune only component 2, keeping 9 genes on component 1
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 2, study = exp,
+                             already.tested.X = c(9),
+                             test.keepX = seq(1,10,1))
+plot(tune_res)
+tune_res$choice.keepX # 10 variables to keep on comp2
\ No newline at end of file
diff --git a/examples/tune.pca-examples.R b/examples/tune.pca-examples.R
new file mode 100644
index 00000000..5f9ec17a
--- /dev/null
+++ b/examples/tune.pca-examples.R
@@ -0,0 +1,17 @@
+# load data
+data(liver.toxicity)
+
+# run tuning
+tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE)
+plot(tune)
+
+# set up multilevel dataset
+repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5,
+                  6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9,
+                  10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14,
+                  13, 14, 15, 16, 15, 16, 15, 16, 15, 16)
+design <- data.frame(sample = repeat.indiv)
+
+# run tuning
+tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE, multilevel = design)
+plot(tune)
\ No newline at end of file
diff --git a/examples/tune.pls-examples.R b/examples/tune.pls-examples.R
new file mode 100644
index 00000000..bca5c900
--- /dev/null
+++ b/examples/tune.pls-examples.R
@@ -0,0 +1,29 @@
+# set up data
+data(liver.toxicity)
+X <- liver.toxicity$gene
+Y <- liver.toxicity$clinic
+
+# tune PLS2 model to find optimal number of components
+tune.res <- tune.pls(X, Y, ncomp = 10, measure = "cor",
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+plot(tune.res) # plot outputs
+
+# PLS1 model example
+Y1 <- liver.toxicity$clinic[,1]
+
+tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor",
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+
+plot(tune.res)
+
+# Multilevel PLS2 model
+repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5,
+                  6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9,
+                  10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14,
+                  13, 14, 15, 16, 15, 16, 15, 16, 15, 16)
+design <- data.frame(sample = repeat.indiv)
+
+tune.res <- tune.pls(X, Y, ncomp = 10, measure = "cor", multilevel = design,
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+
+plot(tune.res)
\ No newline at end of file
diff --git a/examples/tune.plsda-examples.R b/examples/tune.plsda-examples.R
new file mode 100644
index 00000000..b9a64cdb
--- /dev/null
+++ b/examples/tune.plsda-examples.R
@@ -0,0 +1,21 @@
+## Example: analysis with PLS-DA
+data(breast.tumors)
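+# breast.tumors provides gene expression (gene.exp) and a sample treatment factor used below as the outcome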
+ +# tune components and distance +tune = tune.plsda(breast.tumors$gene.exp, as.factor(breast.tumors$sample$treatment), + ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +## Example: multilevel PLS-DA +data(vac18) +design <- data.frame(sample = vac18$sample) # set the multilevel design + +tune1 <- tune.plsda(vac18$genes, vac18$stimulation, + ncomp = 5, multilevel = design, + nrepeat = 10, folds = 10, + seed = 20) +plot(tune1) \ No newline at end of file diff --git a/examples/tune.rcc-examples.R b/examples/tune.rcc-examples.R new file mode 100644 index 00000000..479450bd --- /dev/null +++ b/examples/tune.rcc-examples.R @@ -0,0 +1,10 @@ +#load data +data(nutrimouse) +X <- nutrimouse$lipid +Y <- nutrimouse$gene + +# run tuning +tune_res <- tune.rcc(X, Y, validation = "Mfold") + +# plot output +plot(tune_res) \ No newline at end of file diff --git a/examples/tune.spls-examples.R b/examples/tune.spls-examples.R new file mode 100644 index 00000000..07f8d948 --- /dev/null +++ b/examples/tune.spls-examples.R @@ -0,0 +1,59 @@ +## sPLS2 model example (more than one Y outcome variable) + +# set up data +data(liver.toxicity) +X <- liver.toxicity$gene +Y <- liver.toxicity$clinic + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs + +## sPLS1 model example (only one Y outcome variable) + +# set up data +Y1 <- liver.toxicity$clinic[,1] + +# tune spls model for components only +plot(tune.spls(X, Y1, ncomp = 3, + folds = 3, + test.keepX = NULL, test.keepY = NULL)) + +# tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set +plot(tune.spls(X, Y1, ncomp = 3, + folds = 3, measure = "MSE", + test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) + + +## sPLS2 multilevel model example + +# set up multilevel design +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs diff --git a/examples/tune.splsda-examples.R b/examples/tune.splsda-examples.R new file mode 100644 index 00000000..36e2d585 --- /dev/null +++ b/examples/tune.splsda-examples.R @@ -0,0 +1,53 @@ +## First example: analysis with sPLS-DA +data(breast.tumors) +X = breast.tumors$gene.exp +Y = as.factor(breast.tumors$sample$treatment) + +# first tune on 
components only +tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = NULL, + dist = "all", + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +# then tune optimal keepX for each component +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), dist = "centroids.dist", + progressBar = TRUE, + seed = 20) + +plot(tune) +tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) + +## With already tested variables: +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), + dist = "centroids.dist", + progressBar = TRUE, + seed = 20) +plot(tune) + +## Second example: multilevel one-factor analysis with sPLS-DA + +data(vac18) +X = vac18$genes +Y = vac18$stimulation +# sample indicates the repeated measurements +design = data.frame(sample = vac18$sample) + +# tune on components +tune = tune.splsda(X, Y = Y, ncomp = 5, nrepeat = 10, logratio = "none", + test.keepX = NULL, folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) + +# tune on variables +tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", + test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) \ No newline at end of file diff --git a/man/image.tune.rcc.Rd b/man/image.tune.rcc.Rd index c08786af..8729dae9 100644 --- a/man/image.tune.rcc.Rd +++ b/man/image.tune.rcc.Rd @@ -38,7 +38,7 @@ X <- nutrimouse$lipid Y <- nutrimouse$gene ## this can take some seconds -cv.score <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE) +cv.score <- tune.rcc(X, Y, validation = "Mfold") plot(cv.score) # image(cv.score) # same result as plot() diff --git a/man/perf.Rd b/man/perf.Rd index 7f332e0f..2ce3eef8 100644 --- a/man/perf.Rd +++ b/man/perf.Rd @@ -185,19 +185,25 @@ if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate}{ For -PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification +predicted and actual components for X (t) and Y (u)} + + + +For PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification error rate estimation. The dimensions correspond to the components in the model and to the prediction method used, respectively. Note that error rates reported in any component include the performance of the model in earlier components for the specified \code{keepX} parameters (e.g. error rate reported for component 3 for \code{keepX = 20} already includes the fitted -model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -consider using the \code{predict} function.} -\item{auc}{Averaged AUC values -over the \code{nrepeat}} +model on components 1 and 2 for \code{keepX = 20}). 
+\item{error.rate}{Prediction error rate for each dist and measure}
+\item{auc}{AUC values per component averaged over the \code{nrepeat}}
+\item{auc.all}{AUC values per component per repeat}
+\item{predict}{A list of length ncomp that gives the predicted values of each sample for each class}
+\item{features}{A list of features selected across the folds ($stable.X) for the keepX parameters from the input object.}
+\item{choice.ncomp}{Optimal number of components for the model for each prediction distance using one-sided t-tests that test
+for a significant difference in the mean error rate (gain in prediction) when components are added to the model.}
+\item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components}

For mint.splsda models, \code{perf} produces the following outputs:
\item{study.specific.error}{A list that gives BER, overall error rate and
@@ -210,7 +216,7 @@ predicted class of each sample for each \code{dist} and each of the
\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint
models}

-For sgccda models, \code{perf} produces the following outputs:
+For sgccda models (i.e. block (s)PLS-DA models), \code{perf} produces the following outputs:
\item{error.rate}{Prediction error rate for each block of \code{object$X}
and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for
each block of \code{object$X}, each \code{dist} and each class}
@@ -241,7 +247,8 @@ the predicted class for this particular sample over the blocks.}
\item{WeightedVote.error.rate}{if more than one block, returns the error
rate of the \code{WeightedVote} output} \item{weights}{Returns the weights
of each block used for the weighted predictions, for each nrepeat and each
-fold} \item{choice.ncomp}{For supervised models; returns the optimal number
+fold}
+\item{choice.ncomp}{For supervised models; returns the optimal number
of components for the model for each prediction distance using one-sided
t-tests that test for a significant difference in the mean error rate (gain
in prediction) when components are added to the model. See more details in
diff --git a/man/perf.assess.Rd b/man/perf.assess.Rd
index 264a7361..84a7de9e 100644
--- a/man/perf.assess.Rd
+++ b/man/perf.assess.Rd
@@ -28,7 +28,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mint.plsda}(
+\method{perf.assess}{mint.plsda}(
  object,
  dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
  auc = FALSE,
@@ -37,7 +37,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mint.splsda}(
+\method{perf.assess}{mint.splsda}(
  object,
  dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
  auc = FALSE,
@@ -46,7 +46,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mixo_pls}(
+\method{perf.assess}{mixo_pls}(
  object,
  validation = c("Mfold", "loo"),
  folds,
@@ -57,7 +57,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mixo_spls}(
+\method{perf.assess}{mixo_spls}(
  object,
  validation = c("Mfold", "loo"),
  folds,
@@ -68,7 +68,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mixo_plsda}(
+\method{perf.assess}{mixo_plsda}(
  object,
  dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
  validation = c("Mfold", "loo"),
@@ -82,7 +82,7 @@ perf.assess(object, ...)
  ...
)

-\method{perf}{assess.mixo_splsda}(
+\method{perf.assess}{mixo_splsda}(
  object,
  dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),
  validation = c("Mfold", "loo"),
@@ -98,7 +98,7 @@ perf.assess(object, ...)
}
\arguments{
\item{object}{object of class inherited from \code{"pls"}, \code{"plsda"},
-\code{"spls"}, \code{"splsda"} or \code{"mint.splsda"}. The function will
+\code{"spls"}, \code{"splsda"}, \code{"sgccda"} or \code{"mint.splsda"}. The function will
retrieve some key parameters stored in that object.}

\item{...}{not used}
@@ -109,15 +109,15 @@ performance of the model. Should be a subset of
\code{"max.dist"}, \code{"centroids.dist"}, \code{"mahalanobis.dist"}. Default is
\code{"all"}. See \code{\link{predict}}.}

-\item{validation}{character. What kind of (internal) validation to use,
+\item{validation}{a character string. What kind of (internal) validation to use,
matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
-\code{"Mfold"}.}
+\code{"Mfold"}. For MINT methods only \code{"loo"} will be used.}

-\item{folds}{the folds in the Mfold cross-validation. See Details.}
+\item{folds}{numeric. Number of folds in the Mfold cross-validation. See Details.}

-\item{nrepeat}{Number of times the Cross-Validation process is repeated.
+\item{nrepeat}{numeric. Number of times the Cross-Validation process is repeated.
This is an important argument to ensure the estimation of the performance to
-be as accurate as possible.}
+be as accurate as possible. Default is 1.}

\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC)
performance of the model.}
@@ -137,15 +137,15 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM'
Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation}
}
\value{
-For PLS and sPLS models, \code{perf} produces a list with the
-following components for every repeat:
+For PLS and sPLS models:
\item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only
applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
available when in regression (s)PLS.}
\item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only
applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
available when in regression (s)PLS.}
-\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables. Only applies to object
+\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models
+with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object
inherited from \code{"pls"}, and \code{"spls"}. Only available when in
regression (s)PLS.}
\item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values
else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable.
Note that in the specific case of an sPLS model, it is better to have a look
at the Q2.total criterion, only applies to object inherited from
\code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.}
-\item{Q2.total}{a vector of \eqn{Q^2}-total values for model, only applies to object inherited from
+\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1,
+\ldots ,}\code{ncomp} components, only applies to object inherited from
\code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.}
-\item{RSS}{Residual Sum of Squares across all selected features.}
+\item{RSS}{Residual Sum of Squares across all selected features}
\item{PRESS}{Predicted Residual Error Sum of Squares across all selected
features}
-\item{features}{a list of features selected across the
-folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and
-\code{keepY} parameters from the input object.
Note, this will be \code{NULL} -if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate}{ For -PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification -error rate estimation using overall and BER error rates across different distance methods. -Although error rates are only reported for the number of components used in the final model, -Note that are calculated including the performance of the model in a smaller number of -components for the specified \code{keepX} parameters (e.g. error rate -reported for component 3 for \code{keepX = 20} already includes the fitted -model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -consider using the \code{predict} function.} -\item{auc}{Averaged AUC values -over the \code{nrepeat}} - -#' For sgccda models, \code{perf} produces the following outputs: +predicted and actual components for X (t) and Y (u)} + + + +For PLS-DA and sPLS-DA models: +\item{error.rate}{Prediction error rate for each dist and measure} +\item{auc}{AUC value averaged over the \code{nrepeat}} +\item{auc.all}{AUC values per repeat} +\item{predict}{Predicted values of each sample for each class} +\item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} + +For mint.splsda models: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} +\item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} +\item{class}{A list which gives the +predicted class of each sample for each \code{dist}.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint models} + +For sgccda models (i.e. 
block (s)PLS-DA models): \item{error.rate}{Prediction error rate for each block of \code{object$X} and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for each block of \code{object$X}, each \code{dist} and each class} -\item{predict}{Predicted values of each sample for each class and each block.} -\item{class}{Predicted class of each sample for each block, each \code{dist}, and each nrepeat} -\item{features}{a list of features selected across the folds (\code{$stable.X} and -\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the -input object.} +\item{predict}{Predicted values of each sample for each class and each block} +\item{class}{Predicted class of each sample for each +block, each \code{dist}, and each nrepeat} \item{AveragedPredict.class}{if more than one block, returns the average predicted class over the blocks (averaged of the \code{Predict} output and prediction using the \code{max.dist} distance)} @@ -210,15 +213,7 @@ the predicted class for this particular sample over the blocks.} \item{WeightedVote.error.rate}{if more than one block, returns the error rate of the \code{WeightedVote} output} \item{weights}{Returns the weights of each block used for the weighted predictions, for each nrepeat and each -fold} - -For mint.splsda models, \code{perf} produces the following outputs: -\item{study.specific.error}{A list that gives BER, overall error rate and -error rate per class, for each study} -\item{global.error}{A list that gives BER, overall error rate and error rate per class for all samples} -\item{predict}{A list of the predicted values of each sample for each class} -\item{class}{A list which gives the predicted class of each sample for each \code{dist}. Directly obtained from the \code{predict} output.} -\item{auc}{AUC values} \item{auc.study}{AUC values for each study} +fold} } \description{ Function to evaluate the performance of the fitted PLS, sparse PLS, PLS-DA, @@ -256,16 +251,6 @@ MSEP, \eqn{R^2}, and \eqn{Q^2} criteria are averaged across all folds. Note that for PLS and sPLS objects, perf is performed on the pre-processed data after log ratio transform and multilevel analysis, if any. -Sparse methods. The sPLS, sPLS-DA and sgccda functions are run on several -and different subsets of data (the cross-folds) and will certainly lead to -different subset of selected features. Those are summarised in the output -\code{features$stable} (see output Value below) to assess how often the -variables are selected across all folds. Note that for PLS-DA and sPLS-DA -objects, perf is performed on the original data, i.e. before the -pre-processing step of the log ratio transform and multilevel analysis, if -any. In addition for these methods, the classification error rate is -averaged across all folds. - The mint.sPLS-DA function estimates errors based on Leave-one-group-out cross validation (where each levels of object$study is left out (and predicted) once) and provides study-specific outputs @@ -285,8 +270,7 @@ details. Our multivariate supervised methods already use a prediction threshold based on distances (see \code{predict}) that optimally determine class membership of the samples tested. As such AUC and ROC are not needed to estimate the performance of the model. We provide those outputs as -complementary performance measures. See more details in our mixOmics -article. +complementary performance measures. Prediction distances. See details from \code{?predict}, and also our supplemental material in the mixOmics article. 
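To make the point above concrete, a minimal usage sketch (assuming the srbct data bundled with mixOmics, as used in the perf() examples; the accessed fields follow the Value section of this page):

library(mixOmics)
data(srbct)

# small sPLS-DA model; distance-based error rates and the complementary AUC
# are then estimated in a single perf.assess() call
srbct.splsda <- splsda(srbct$gene, srbct$class, ncomp = 2, keepX = c(10, 10))
res <- perf.assess(srbct.splsda, validation = "Mfold", folds = 5,
                   nrepeat = 3, auc = TRUE, seed = 45)
res$error.rate$BER # balanced error rate for each prediction distance
res$auc            # AUC averaged over the nrepeat runs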
@@ -309,141 +293,102 @@ given block with the outcome variable Y. More details about the PLS modes in \code{?pls}. } \examples{ -## validation for objects of class 'pls' (regression) -# ---------------------------------------- -data(liver.toxicity) -X <- liver.toxicity$gene -Y <- liver.toxicity$clinic - -# try tune the number of component to choose -# --------------------- -# first learn the full model -liver.pls <- pls(X, Y, ncomp = 5) - -# with 5-fold cross validation: we use the same parameters as in model above -# but we perform cross validation to compute the MSEP, Q2 and R2 criteria -# --------------------------- -liver.val <- perf(liver.pls, validation = "Mfold", folds = 5) - -# see available criteria -names(liver.val$measures) -# see values for all repeats -liver.val$measures$Q2.total$values -# see summary over repeats -liver.val$measures$Q2.total$summary -# Q2 total should decrease until it reaches a threshold -liver.val$measures$Q2.total - -# ncomp = 2 is enough -plot(liver.val, criterion = 'Q2.total') - -\dontrun{ - -# have a look at the other criteria -# ---------------------- -# R2 -plot(liver.val, criterion = 'R2') -## correlation of components (see docs) -plot(liver.val, criterion = 'cor.tpred') - -# MSEP -plot(liver.val, criterion = 'MSEP') -## validation for objects of class 'spls' (regression) -# ---------------------------------------- -ncomp = 7 -# first, learn the model on the whole data set -model.spls = spls(X, Y, ncomp = ncomp, mode = 'regression', - keepX = c(rep(10, ncomp)), keepY = c(rep(4,ncomp))) - - -# with leave-one-out cross validation -model.spls.val <- perf(model.spls, validation = "Mfold", folds = 5, seed = 45 ) - -#Q2 total -model.spls.val$measures$Q2$summary - -# R2: we can see how the performance degrades when ncomp increases -plot(model.spls.val, criterion="R2") - -## validation for objects of class 'splsda' (classification) -# ---------------------------------------- -data(srbct) -X <- srbct$gene -Y <- srbct$class - -ncomp = 2 - -srbct.splsda <- splsda(X, Y, ncomp = ncomp, keepX = rep(10, ncomp)) - -# with Mfold -# --------- -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", auc = TRUE, seed = 45) -error -error$auc - -plot(error) - -# parallel code -library(BiocParallel) -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", auc = TRUE, BPPARAM = SnowParam(workers = 2), seed = 45) - -# with 5 components and nrepeat=5, to get a $choice.ncomp -ncomp = 5 -srbct.splsda <- splsda(X, Y, ncomp = ncomp, keepX = rep(10, ncomp)) - -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", nrepeat = 5, seed = 45) -error$choice.ncomp - -plot(error) - - -## validation for objects of class 'mint.splsda' (classification) -# ---------------------------------------- - -data(stemcells) -res = mint.splsda(X = stemcells$gene, Y = stemcells$celltype, - ncomp = 3, keepX = c(10, 5, 15), - study = stemcells$study) - -out = perf(res, auc = TRUE) -out -plot(out) -out$auc -out$auc.study - -## validation for objects of class 'sgccda' (classification) -# ---------------------------------------- - -data(nutrimouse) -Y = nutrimouse$diet -data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid) - -nutrimouse.sgccda <- block.splsda(X=data, -Y = Y, -design = 'full', -keepX = list(gene=c(10,10), lipid=c(15,15)), -ncomp = 2) - -perf = perf(nutrimouse.sgccda) -perf -plot(perf) +## PLS-DA example +data(liver.toxicity) # rats gex and clinical measurements/treatments +unique(liver.toxicity$treatment$Treatment.Group) # 16 
groups
+length(liver.toxicity$treatment$Treatment.Group) # 64 samples
+
+plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2)
+
+performance <- perf.assess(plsda.res,
+                    # to make sure each fold has all classes represented
+                    validation = "Mfold", folds = 3, nrepeat = 10,
+                    seed = 12) # for reproducibility, remove for analysis
+
+performance$error.rate$BER
+
+## sPLS-DA example
+
+splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group,
+                     keepX = c(25, 25), ncomp = 2)
+
+performance <- perf.assess(splsda.res,
+                    validation = "Mfold", folds = 3, nrepeat = 10,
+                    seed = 12)
+
+performance$error.rate$BER # can see a slight improvement in error rate over the PLS-DA example
+
+## PLS example
+
+ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model
+
+pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2)
+
+performance <- perf.assess(pls.res,
+                    validation = "Mfold", folds = 3, nrepeat = 10,
+                    seed = 12)
+
+# see Q2 which gives an indication of predictive ability for each of the 10 Y outputs
+performance$measures$Q2$summary
+
+## sPLS example
+
+spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50))
+
+performance <- perf.assess(spls.res,
+                    validation = "Mfold", folds = 3, nrepeat = 10,
+                    seed = 12)
+
+# see Q2 which gives an indication of predictive ability for each of the 10 Y outputs
+performance$measures$Q2$summary
+
+
+## block PLS-DA example
+
+data("breast.TCGA")
+mrna <- breast.TCGA$data.train$mrna
+mirna <- breast.TCGA$data.train$mirna
+data <- list(mrna = mrna, mirna = mirna)
+design <- matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data)))
+diag(design) <- 0
+
+block.plsda.res <- block.plsda(X = data, Y = breast.TCGA$data.train$subtype,
+                               ncomp = 2, design = design)
+
+performance <- perf.assess(block.plsda.res)
+
+performance$error.rate.per.class # error rate per class per distance metric
+
+## block sPLS-DA example
+
+block.splsda.res <- block.splsda(X = data, Y = breast.TCGA$data.train$subtype,
+                                 ncomp = 2, design = design,
+                                 keepX = list(mrna = c(8,8), mirna = c(8,8)))
+
+performance <- perf.assess(block.splsda.res)
+
+performance$error.rate.per.class
+
+## MINT PLS-DA example
+
+data("stemcells")
+
+mint.plsda.res <- mint.plsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3,
+                             study = stemcells$study)
+
+performance <- perf.assess(mint.plsda.res)
+
+performance$global.error$BER # global error per distance metric
+
+## MINT sPLS-DA example
+
+mint.splsda.res <- mint.splsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3,
+                              keepX = c(10, 5, 15), study = stemcells$study)
+
+performance <- perf.assess(mint.splsda.res)
+
+performance$global.error$BER # error slightly higher in this sparse model versus the non-sparse model
 }
 \references{
 Singh A., Shannon C., Gautier B., Rohart F., Vacher M., Tebbutt S. 
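The BER extracted throughout these examples is the balanced error rate: the error rate is computed per class and then averaged, so minority classes are not swamped by majority classes (as described in the details sections below). As a minimal illustrative sketch in base R, with a hypothetical confusion matrix that is not an output of the examples above:

# hypothetical confusion matrix: rows = true class, columns = predicted class
conf <- matrix(c(18, 2,
                 6, 4),
               nrow = 2, byrow = TRUE,
               dimnames = list(truth = c("A", "B"), pred = c("A", "B")))

# per-class error rate: misclassified samples / samples in that class
per.class <- 1 - diag(conf) / rowSums(conf)
per.class                # A: 0.10, B: 0.60

# BER averages the per-class rates; compare with the overall error rate,
# which is dominated by the larger class
mean(per.class)          # BER = 0.35
(2 + 6) / sum(conf)      # overall error rate ~ 0.27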
diff --git a/man/tune.Rd b/man/tune.Rd
index d77fa96b..e9fe06fa 100644
--- a/man/tune.Rd
+++ b/man/tune.Rd
@@ -5,7 +5,8 @@
 \title{Wrapper function to tune pls-derived methods.}
 \usage{
 tune(
-  method = c("spls", "splsda", "block.splsda", "mint.splsda", "rcc", "pca", "spca"),
+  method = c("pls", "spls", "plsda", "splsda", "block.plsda", "block.splsda",
+    "mint.plsda", "mint.splsda", "rcc", "pca", "spca"),
   X,
   Y,
   test.keepX = c(5, 10, 15),
@@ -13,33 +14,32 @@ tune(
   already.tested.X,
   already.tested.Y,
   ncomp,
-  mode = c("regression", "canonical", "invariant", "classic"),
-  nrepeat = 1,
-  folds = 10,
-  validation = "Mfold",
-  max.iter = 100,
-  tol = 1e-09,
-  signif.threshold = 0.01,
-  logratio = c("none", "CLR"),
   V,
   center = TRUE,
+  grid1 = seq(0.001, 1, length = 5),
+  grid2 = seq(0.001, 1, length = 5),
+  mode = c("regression", "canonical", "invariant", "classic"),
+  indY,
+  weighted = TRUE,
+  design,
+  study,
+  tol = 1e-09,
   scale = TRUE,
+  logratio = c("none", "CLR"),
   near.zero.var = FALSE,
+  max.iter = 100,
+  multilevel = NULL,
+  validation = "Mfold",
+  nrepeat = 1,
+  folds = 10,
+  signif.threshold = 0.01,
   dist = "max.dist",
   measure = ifelse(method == "spls", "cor", "BER"),
-  multilevel = NULL,
+  auc = FALSE,
   seed = NULL,
   BPPARAM = SerialParam(),
   progressBar = FALSE,
-  auc = FALSE,
-  light.output = TRUE,
-  plot = FALSE,
-  indY,
-  weighted = TRUE,
-  design,
-  grid1 = seq(0.001, 1, length = 5),
-  grid2 = seq(0.001, 1, length = 5),
-  study
+  light.output = TRUE
 )
 }
 \arguments{
@@ -69,35 +69,42 @@ data set on the first components}
 
 \item{ncomp}{the number of components to include in the model.}
 
+\item{V}{Matrix used in the logratio transformation if provided (for tune.pca)}
+
+\item{center}{a logical value indicating whether the variables should be
+shifted to be zero centered. Alternately, a vector of length equal to the
+number of columns of \code{X} can be supplied. The value is passed to
+\code{\link{scale}}.}
+
+\item{grid1, grid2}{numeric vectors defining the values of \code{lambda1} and
+\code{lambda2} at which the cross-validation score should be computed. Defaults
+to \code{grid1=grid2=seq(0.001, 1, length=5)}.}
+
 \item{mode}{character string. What type of algorithm to use, (partially)
 matching one of \code{"regression"}, \code{"canonical"}, \code{"invariant"}
 or \code{"classic"}. See Details.}
 
-\item{nrepeat}{Number of times the Cross-Validation process is repeated.}
+\item{indY}{To supply if \code{Y} is missing, indicates the position of
+the matrix response in the list \code{X}.}
 
-\item{folds}{the folds in the Mfold cross-validation. See Details.}
+\item{weighted}{tune using either the performance of the Majority vote or
+the Weighted vote.}
 
-\item{validation}{character. What kind of (internal) validation to use,
-matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
-\code{"Mfold"}.}
+\item{design}{numeric matrix of size (number of blocks in X) x (number of
+blocks in X) with values between 0 and 1. Each value indicates the strength
+of the relationship to be modelled between two blocks; a value of 0
+indicates no relationship, 1 is the maximum value. Alternatively, one of
+c('null', 'full') indicating a disconnected or fully connected design,
+respectively, or a numeric between 0 and 1 which will designate all
+off-diagonal elements of a fully connected design (see examples in
+\code{block.splsda}). 
If \code{Y} is provided instead of \code{indY}, the +\code{design} matrix is changed to include relationships to \code{Y}.} -\item{max.iter}{Integer, the maximum number of iterations.} +\item{study}{grouping factor indicating which samples are from the same +study} \item{tol}{Numeric, convergence tolerance criteria.} -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} - -\item{logratio}{one of ('none','CLR'). Default to 'none'} - -\item{V}{Matrix used in the logratio transformation id provided (for tune.pca)} - -\item{center}{a logical value indicating whether the variables should be -shifted to be zero centered. Alternately, a vector of length equal the -number of columns of \code{X} can be supplied. The value is passed to -\code{\link{scale}}.} - \item{scale}{a logical value indicating whether the variables should be scaled to have unit variance before the analysis takes place. The default is \code{FALSE} for consistency with \code{prcomp} function, but in general @@ -105,19 +112,38 @@ scaling is advisable. Alternatively, a vector of length equal the number of columns of \code{X} can be supplied. The value is passed to \code{\link{scale}}.} +\item{logratio}{one of ('none','CLR'). Default to 'none'} + \item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Default value is FALSE} +\item{max.iter}{Integer, the maximum number of iterations.} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements) that indicates the repeated measures on each individual, i.e. +the individuals ID. See Details.} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is +\code{"Mfold"}.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{distance metric to estimate the classification error rate, should be a subset of \code{"centroids.dist"}, \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} \item{measure}{The tuning measure used for different methods. See details.} -\item{multilevel}{Design matrix for multilevel analysis (for repeated -measurements) that indicates the repeated measures on each individual, i.e. -the individuals ID. See Details.} +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model.} \item{seed}{set a number here if you want the function to give reproducible outputs. Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} @@ -128,37 +154,8 @@ of parallelisation. See examples.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) -performance of the model.} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} - -\item{plot}{logical argument indicating whether an image map should be -plotted by calling the \code{imgCV} function. 
(for tune.rcc)}
-
-\item{indY}{To supply if \code{Y} is missing, indicates the position of
-the matrix response in the list \code{X}.}
-
-\item{weighted}{tune using either the performance of the Majority vote or
-the Weighted vote.}
-
-\item{design}{numeric matrix of size (number of blocks in X) x (number of
-blocks in X) with values between 0 and 1. Each value indicates the strenght
-of the relationship to be modelled between two blocks; a value of 0
-indicates no relationship, 1 is the maximum value. Alternatively, one of
-c('null', 'full') indicating a disconnected or fully connected design,
-respecively, or a numeric between 0 and 1 which will designate all
-off-diagonal elements of a fully connected design (see examples in
-\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the
-\code{design} matrix is changed to include relationships to \code{Y}.}
-
-\item{grid1, grid2}{vector numeric defining the values of \code{lambda1} and
-\code{lambda2} at which cross-validation score should be computed. Defaults
-to \code{grid1=grid2=seq(0.001, 1, length=5)}.}
-
-\item{study}{grouping factor indicating which samples are from the same
-study}
 }
 \value{
 Depending on the type of analysis performed and the input arguments,
diff --git a/man/tune.block.plsda.Rd b/man/tune.block.plsda.Rd
new file mode 100644
index 00000000..d2401355
--- /dev/null
+++ b/man/tune.block.plsda.Rd
@@ -0,0 +1,243 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tune.block.plsda.R
+\name{tune.block.plsda}
+\alias{tune.block.plsda}
+\title{Tuning function for block.plsda method (N-integration with Discriminant Analysis)}
+\usage{
+tune.block.plsda(
+  X,
+  Y,
+  indY,
+  ncomp = 2,
+  tol = 1e-06,
+  max.iter = 100,
+  near.zero.var = FALSE,
+  design,
+  scale = TRUE,
+  validation = "Mfold",
+  folds = 10,
+  nrepeat = 1,
+  signif.threshold = 0.01,
+  dist = "all",
+  weighted = TRUE,
+  progressBar = FALSE,
+  light.output = TRUE,
+  BPPARAM = SerialParam(),
+  seed = NULL,
+  ...
+)
+}
+\arguments{
+\item{X}{A named list of data sets (called 'blocks') measured on the same
+samples. Data in the list should be arranged in matrices, samples x variables,
+with samples order matching in all data sets.}
+
+\item{Y}{a factor or a class vector for the discrete outcome.}
+
+\item{indY}{To supply if \code{Y} is missing, indicates the position of
+the matrix response in the list \code{X}.}
+
+\item{ncomp}{the number of components to include in the model. Default to 2.
+Applies to all blocks.}
+
+\item{tol}{Positive numeric used as convergence criteria/tolerance during the
+iterative process. Default to \code{1e-06}.}
+
+\item{max.iter}{Integer, the maximum number of iterations. Default to 100.}
+
+\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}}
+function (should be set to TRUE in particular for data with many zero
+values). Setting this argument to FALSE (when appropriate) will speed up the
+computations. Default value is FALSE.}
+
+\item{design}{numeric matrix of size (number of blocks in X) x (number of
+blocks in X) with values between 0 and 1. Each value indicates the strength
+of the relationship to be modelled between two blocks; a value of 0
+indicates no relationship, 1 is the maximum value. Alternatively, one of
+c('null', 'full') indicating a disconnected or fully connected design,
+respectively, or a numeric between 0 and 1 which will designate all
+off-diagonal elements of a fully connected design (see examples in
+\code{block.splsda}). 
If \code{Y} is provided instead of \code{indY}, the
+\code{design} matrix is changed to include relationships to \code{Y}.}
+
+\item{scale}{Logical. If scale = TRUE, each block is standardized to zero
+means and unit variances (default: TRUE)}
+
+\item{validation}{character. What kind of (internal) validation to use,
+matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
+\code{"Mfold"}.}
+
+\item{folds}{the folds in the Mfold cross-validation. See Details.}
+
+\item{nrepeat}{Number of times the Cross-Validation process is repeated.}
+
+\item{signif.threshold}{numeric between 0 and 1 indicating the significance
+threshold required for improvement in error rate of the components. Default
+to 0.01.}
+
+\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all".
+Default is "all".}
+
+\item{weighted}{tune using either the performance of the Majority vote or
+the Weighted vote.}
+
+\item{progressBar}{by default set to \code{TRUE} to output the progress bar
+of the computation.}
+
+\item{light.output}{if set to FALSE, the prediction/classification of each
+sample for each of \code{test.keepX} and each comp is returned.}
+
+\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type
+of parallelisation. See examples.}
+
+\item{seed}{set a number here if you want the function to give reproducible outputs.
+Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.}
+
+\item{...}{Optional arguments:
+\itemize{
+  \item \bold{seed} Integer. Seed number for reproducible parallel code.
+  Default is \code{NULL}.
+}
+Repeated cross-validation, usually the most computationally intensive
+process, is run in parallel. If there is excess CPU, the
+cross-validation is also parallelised on *nix-based OS which support
+\code{mclapply}.
+Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'.}
+}
+\value{
+returns:
+\item{error.rate}{Prediction error rate for each block of \code{object$X}
+and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for
+each block of \code{object$X}, each \code{dist} and each class}
+\item{predict}{Predicted values of each sample for each class, each block
+and each component} \item{class}{Predicted class of each sample for each
+block, each \code{dist}, each component and each nrepeat} \item{features}{a
+list of features selected across the folds (\code{$stable.X} and
+\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the
+input object.} \item{AveragedPredict.class}{if more than one block, returns
+the average predicted class over the blocks (average of the \code{Predict}
+output and prediction using the \code{max.dist} distance)}
+\item{AveragedPredict.error.rate}{if more than one block, returns the
+average predicted error rate over the blocks (using the
+\code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more
+than one block, returns the weighted predicted class over the blocks
+(weighted average of the \code{Predict} output and prediction using the
+\code{max.dist} distance). See details for more info on weights.}
+\item{WeightedPredict.error.rate}{if more than one block, returns the
+weighted average predicted error rate over the blocks (using the
+\code{WeightedPredict.class} output).} \item{MajorityVote}{if more than one
+block, returns the majority class over the blocks. 
NA for a sample means that +there is no consensus on the predicted class for this particular sample over +the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +the error rate of the \code{MajorityVote} output} +\item{WeightedVote}{if more than one block, returns the weighted majority +class over the blocks. NA for a sample means that there is no consensus on +the predicted class for this particular sample over the blocks.} +\item{WeightedVote.error.rate}{if more than one block, returns the error +rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +of each block used for the weighted predictions, for each nrepeat and each +fold} \item{choice.ncomp}{For supervised models; returns the optimal number +of components for the model for each prediction distance using one-sided +t-tests that test for a significant difference in the mean error rate (gain +in prediction) when components are added to the model. See more details in +Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +returned for each prediction framework.} +} +\description{ +Computes M-fold or Leave-One-Out Cross-Validation scores based on a +user-input grid to determine the optimal parameters for +method \code{block.plsda}. +} +\details{ +This tuning function should be used to tune the number of components in the \code{block.plsda} function (N-integration with Discriminant Analysis). + +M-fold or LOO cross-validation is performed with stratified subsampling +where all classes are represented in each fold. + +If \code{validation = "Mfold"}, M-fold cross-validation is performed. The +number of folds to generate is to be specified in the argument \code{folds}. + +If \code{validation = "loo"}, leave-one-out cross-validation is performed. +By default \code{folds} is set to the number of unique individuals. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). Details +about the PLS modes are in \code{?pls}. + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. 
+}
+\examples{
+data("breast.TCGA")
+
+# X data - list of mRNA, miRNA and protein
+X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
+          protein = breast.TCGA$data.train$protein)
+
+# Y data - factor of breast cancer subtypes
+Y <- breast.TCGA$data.train$subtype
+
+# subset the X and Y data to speed up computation in this example
+set.seed(100)
+subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]]
+X <- lapply(X, function(omic) omic[subset,])
+Y <- Y[subset]
+
+# set up a full design where every block is connected
+# could also consider other weights, see our mixOmics manuscript
+design = matrix(1, ncol = length(X), nrow = length(X),
+                dimnames = list(names(X), names(X)))
+diag(design) = 0
+design
+
+## Tune number of components to keep - use all distance metrics
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("all"))
+
+plot(tune_res)
+tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist
+
+
+## Tune number of components to keep - use majority vote rather than weighted vote
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("all"),
+                             weighted = FALSE)
+tune_res$weights
+
+## Tune number of components to keep - plot just max.dist
+tune_res <- tune.block.plsda(X, Y, design = design,
+                             ncomp = 5,
+                             nrepeat = 3,
+                             seed = 13,
+                             dist = c("max.dist"))
+plot(tune_res)
+}
+\references{
+Method:
+
+Singh A., Gautier B., Shannon C., Vacher M., Rohart F., Tebbutt S. and Lê
+Cao K.A. (2016). DIABLO: multi omics integration for biomarker discovery.
+
+mixOmics article:
+
+Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics
+feature selection and multiple data integration. PLoS Comput Biol 13(11):
+e1005752
+}
+\seealso{
+\code{\link{block.splsda}} and http://www.mixOmics.org for more
+details.
+}
+\author{
+Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi
+}
+\keyword{multivariate}
+\keyword{regression}
diff --git a/man/tune.block.splsda.Rd b/man/tune.block.splsda.Rd
index f36a7cbb..0b365683 100644
--- a/man/tune.block.splsda.Rd
+++ b/man/tune.block.splsda.Rd
@@ -10,25 +10,24 @@ tune.block.splsda(
   Y,
   indY,
   ncomp = 2,
+  tol = 1e-06,
+  max.iter = 100,
+  near.zero.var = FALSE,
+  design,
+  scale = TRUE,
   test.keepX,
   already.tested.X,
   validation = "Mfold",
   folds = 10,
+  nrepeat = 1,
+  signif.threshold = 0.01,
   dist = "max.dist",
   measure = "BER",
   weighted = TRUE,
   progressBar = FALSE,
-  tol = 1e-06,
-  max.iter = 100,
-  near.zero.var = FALSE,
-  nrepeat = 1,
-  design,
-  scale = TRUE,
   light.output = TRUE,
-  signif.threshold = 0.01,
   BPPARAM = SerialParam(),
-  seed = NULL,
-  ...
+  seed = NULL
 )
 }
 \arguments{
@@ -44,34 +43,6 @@ the matrix response in the list \code{X}.}
 
 \item{ncomp}{the number of components to include in the model. Default to 2.
Applies to all blocks.}
 
-\item{test.keepX}{A named list with the same length and names as X
-(without the outcome Y, if it is provided in X and designated using
-\code{indY}). Each entry of this list is a numeric vector for the different
-keepX values to test for that specific block.}
-
-\item{already.tested.X}{Optional, if \code{ncomp > 1} A named list of
-numeric vectors each of length \code{n_tested} indicating the number of
-variables to select from the \eqn{X} data set on the first \code{n_tested}
-components.}
-
-\item{validation}{character. What kind of (internal) validation to use,
-matching one of \code{"Mfold"} or \code{"loo"} (see below). 
Default is
-\code{"Mfold"}.}
-
-\item{folds}{the folds in the Mfold cross-validation. See Details.}
-
-\item{dist}{distance metric to estimate the
-classification error rate, should be a subset of \code{"centroids.dist"},
-\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).}
-
-\item{measure}{The tuning measure used for different methods. See details.}
-
-\item{weighted}{tune using either the performance of the Majority vote or
-the Weighted vote.}
-
-\item{progressBar}{by default set to \code{TRUE} to output the progress bar
-of the computation.}
-
 \item{tol}{Positive numeric used as convergence criteria/tolerance during
 the iterative process. Default to \code{1e-06}.}
 
@@ -82,8 +53,6 @@ function (should be set to TRUE in particular for data with many zero
 values). Setting this argument to FALSE (when appropriate) will speed up the
 computations. Default value is FALSE.}
 
-\item{nrepeat}{Number of times the Cross-Validation process is repeated.}
-
 \item{design}{numeric matrix of size (number of blocks in X) x (number of
 blocks in X) with values between 0 and 1. Each value indicates the strenght
 of the relationship to be modelled between two blocks; a value of 0
@@ -97,29 +66,49 @@ off-diagonal elements of a fully connected design (see examples in
 \code{block.splsda}).
 
 \item{scale}{Logical. If scale = TRUE, each block is standardized to zero
 means and unit variances (default: TRUE)}
 
-\item{light.output}{if set to FALSE, the prediction/classification of each
-sample for each of \code{test.keepX} and each comp is returned.}
+\item{test.keepX}{A named list with the same length and names as X
+(without the outcome Y, if it is provided in X and designated using
+\code{indY}). Each entry of this list is a numeric vector for the different
+keepX values to test for that specific block. If set to NULL, only \code{ncomp} is tuned.}
+
+\item{already.tested.X}{Optional, if \code{ncomp > 1} A named list of
+numeric vectors each of length \code{n_tested} indicating the number of
+variables to select from the \eqn{X} data set on the first \code{n_tested}
+components.}
+
+\item{validation}{character. What kind of (internal) validation to use,
+matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is
+\code{"Mfold"}.}
+
+\item{folds}{the folds in the Mfold cross-validation. See Details.}
+
+\item{nrepeat}{Number of times the Cross-Validation process is repeated.}
 
 \item{signif.threshold}{numeric between 0 and 1 indicating the significance
 threshold required for improvement in error rate of the components. Default
 to 0.01.}
 
+\item{dist}{distance metric to estimate the classification error rate, should be one of
+"centroids.dist", "mahalanobis.dist" or "max.dist" (see Details). If \code{test.keepX = NULL},
+"all" or more than one distance metric can also be supplied}
+
+\item{measure}{Only used when \code{test.keepX} is not NULL. The measure used when plotting,
+should be 'BER' or 'overall'}
+
+\item{weighted}{tune using either the performance of the Majority vote or
+the Weighted vote.}
+
+\item{progressBar}{by default set to \code{TRUE} to output the progress bar
+of the computation.}
+
+\item{light.output}{if set to FALSE, the prediction/classification of each
+sample for each of \code{test.keepX} and each comp is returned.}
+
 \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type
 of parallelisation. See examples.}
 
 \item{seed}{set a number here if you want the function to give reproducible outputs.
Not recommended during exploratory analysis. 
Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.}
-
-\item{...}{Optional arguments:
-\itemize{
-  \item \bold{seed} Integer. Seed number for reproducible parallel code.
-  Default is \code{NULL}.
-}
-run in parallel when repeating the cross-validation, which is usually the
-most computationally intensive process. If there is excess CPU, the
-cross-vaidation is also parallelised on *nix-based OS which support
-\code{mclapply}.
-Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'.}
 }
 \value{
 A list that contains: \item{error.rate}{returns the prediction error
@@ -139,15 +128,53 @@ comp and each repeat. Only if light.output=FALSE}
 \item{cor.value}{compute the correlation between latent variables for
 two-factor sPLS-DA analysis.}
+
+If \code{test.keepX = NULL}, returns:
+\item{error.rate}{Prediction error rate for each block of \code{object$X}
+and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for
+each block of \code{object$X}, each \code{dist} and each class}
+\item{predict}{Predicted values of each sample for each class, each block
+and each component} \item{class}{Predicted class of each sample for each
+block, each \code{dist}, each component and each nrepeat} \item{features}{a
+list of features selected across the folds (\code{$stable.X} and
+\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the
+input object.} \item{AveragedPredict.class}{if more than one block, returns
+the average predicted class over the blocks (average of the \code{Predict}
+output and prediction using the \code{max.dist} distance)}
+\item{AveragedPredict.error.rate}{if more than one block, returns the
+average predicted error rate over the blocks (using the
+\code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more
+than one block, returns the weighted predicted class over the blocks
+(weighted average of the \code{Predict} output and prediction using the
+\code{max.dist} distance). See details for more info on weights.}
+\item{WeightedPredict.error.rate}{if more than one block, returns the
+weighted average predicted error rate over the blocks (using the
+\code{WeightedPredict.class} output).} \item{MajorityVote}{if more than one
+block, returns the majority class over the blocks. NA for a sample means that
+there is no consensus on the predicted class for this particular sample over
+the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns
+the error rate of the \code{MajorityVote} output}
+\item{WeightedVote}{if more than one block, returns the weighted majority
+class over the blocks. NA for a sample means that there is no consensus on
+the predicted class for this particular sample over the blocks.}
+\item{WeightedVote.error.rate}{if more than one block, returns the error
+rate of the \code{WeightedVote} output} \item{weights}{Returns the weights
+of each block used for the weighted predictions, for each nrepeat and each
+fold} \item{choice.ncomp}{For supervised models; returns the optimal number
+of components for the model for each prediction distance using one-sided
+t-tests that test for a significant difference in the mean error rate (gain
+in prediction) when components are added to the model. See more details in
+Rohart et al 2017 Suppl. 
For more than one block, an optimal ncomp is
+returned for each prediction framework.}
 }
 \description{
 Computes M-fold or Leave-One-Out Cross-Validation scores based on a
-user-input grid to determine the optimal parsity parameters values for
+user-input grid to determine the optimal parameters for
 method \code{block.splsda}.
 }
 \details{
-This tuning function should be used to tune the keepX parameters in the
-\code{block.splsda} function (N-integration with sparse Discriminant
+This tuning function should be used to tune the number of components and the
+keepX parameters in the \code{block.splsda} function (N-integration with sparse Discriminant
 Analysis).
 
 M-fold or LOO cross-validation is performed with stratified subsampling
@@ -172,64 +199,73 @@ class, weighted by the number of samples in each class. BER is less biased
 towards majority classes during the performance assessment.
 }
 \examples{
-\dontrun{
+## Set up data
+
+# load data
 data("breast.TCGA")
-# this is the X data as a list of mRNA and miRNA; the Y data set is a single data set of proteins
-data = list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
-            protein = breast.TCGA$data.train$protein)
+
+# X data - list of mRNA, miRNA and protein
+X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna,
+          protein = breast.TCGA$data.train$protein)
+
+# Y data - factor of breast cancer subtypes
+Y <- breast.TCGA$data.train$subtype
+
+# subset the X and Y data to speed up computation in this example
+set.seed(100)
+subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]]
+X <- lapply(X, function(omic) omic[subset,])
+Y <- Y[subset]
+
 # set up a full design where every block is connected
 # could also consider other weights, see our mixOmics manuscript
-design = matrix(1, ncol = length(data), nrow = length(data),
-                dimnames = list(names(data), names(data)))
+design = matrix(1, ncol = length(X), nrow = length(X),
+                dimnames = list(names(X), names(X)))
 diag(design) = 0
 design
-# set number of component per data set
-ncomp = 3
-# Tuning the first two components
-# -------------
-## Not run:
+
+## Tune number of components to keep
+tune_res <- tune.block.splsda(X, Y, design = design,
+                              ncomp = 5,
+                              test.keepX = NULL,
+                              validation = "Mfold", nrepeat = 3,
+                              dist = "all", measure = "BER",
+                              seed = 13)
+
+plot(tune_res)
+
+tune_res$choice.ncomp # 3 components best
+
+## Tune number of variables to keep
+
 # definition of the keepX value to be tested for each block mRNA miRNA and protein
 # names of test.keepX must match the names of 'data'
 test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8))
-# the following may take some time to run, so we subset the data first. 
-# Note that for thorough tuning, nrepeat should be >= 3 so that significance of -# the model improvement can be measured -## ---- subset by 3rd of samples -set.seed(100) -subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] -data <- lapply(data, function(omic) omic[subset,]) -Y <- breast.TCGA$data.train$subtype[subset] -## ---- run -# Check if the environment variable exists (during R CMD check) and limit cores accordingly -max_cores <- if (Sys.getenv("_R_CHECK_LIMIT_CORES_") != "") 2 else parallel::detectCores() - 1 -# Setup the parallel backend with the appropriate number of workers -BPPARAM <- BiocParallel::MulticoreParam(workers = max_cores) - -tune <- tune.block.splsda( - X = data, - Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - design = design, - nrepeat = 2, - BPPARAM = BPPARAM -) +# load parallel package +library(BiocParallel) -plot(tune) -tune$choice.ncomp -tune$choice.keepX +# run tuning in parallel on 2 cores, output plot on overall error +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 2, + test.keepX = test.keepX, + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2)) + +plot(tune_res) +tune_res$choice.keepX # Now tuning a new component given previous tuned keepX -already.tested.X = tune$choice.keepX -tune = tune.block.splsda(X = data, Y = Y, - ncomp = 4, test.keepX = test.keepX, design = design, - already.tested.X = already.tested.X, - BPPARAM = BPPARAM - ) -tune$choice.keepX -} +already.tested.X <- tune_res$choice.keepX +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 3, + test.keepX = test.keepX, + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2), + already.tested.X = already.tested.X) +tune_res$choice.keepX } \references{ Method: diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd new file mode 100644 index 00000000..75df5ec3 --- /dev/null +++ b/man/tune.mint.plsda.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.mint.plsda.R +\name{tune.mint.plsda} +\alias{tune.mint.plsda} +\title{Estimate the parameters of mint.plsda method} +\usage{ +tune.mint.plsda( + X, + Y, + ncomp = 1, + study, + scale = TRUE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + signif.threshold = 0.01, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + auc = FALSE, + progressBar = FALSE, + light.output = TRUE +) +} +\arguments{ +\item{X}{numeric matrix of predictors. \code{NA}s are allowed.} + +\item{Y}{Outcome. Numeric vector or matrix of responses (for multi-response +models)} + +\item{ncomp}{Number of components to include in the model (see Details). +Default to 1} + +\item{study}{grouping factor indicating which samples are from the same +study} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. 
Default +to 0.01.} + +\item{dist}{only applies to an object inheriting from \code{"plsda"} or +\code{"splsda"} to evaluate the classification performance of the model. +Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, +\code{"mahalanobis.dist"}. Default is \code{"all"}. See +\code{\link{predict}}.} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model.} + +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} +} +\value{ +The returned value is a list with components: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} \item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} \item{class}{A list which gives the +predicted class of each sample for each \code{dist} and each of the +\code{ncomp} components. Directly obtained from the \code{predict} output.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +models}. +} +\description{ +Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a +user-input grid to determine optimal values for the parameters in +\code{mint.plsda}. +} +\details{ +This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), +where each of \code{study} is left out once. + +The function outputs the optimal number of components that achieve the best +performance based on the overall error rate or BER. The assessment is +data-driven and similar to the process detailed in (Rohart et al., 2016), +where one-sided t-tests assess whether there is a gain in performance when +adding a component to the model. Our experience has shown that in most case, +the optimal number of components is the number of categories in \code{Y} - +1, but it is worth tuning a few extra components to check (see our website +and case studies for more details). + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). +} +\examples{ +# set up data +data(stemcells) +data <- stemcells$gene +type.id <- stemcells$celltype +exp <- stemcells$study + +# tune number of components +tune_res <- tune.mint.plsda(X = data,Y = type.id, ncomp=5, + near.zero.var=FALSE, + study=exp) + +plot(tune_res) +tune_res$choice.ncomp # 1 component +} +\references{ +Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). +MINT: A multivariate integrative approach to identify a reproducible +biomarker signature across multiple experiments and platforms. BMC +Bioinformatics 18:128. + +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 +} +\seealso{ +\code{\link{mint.plsda}} and http://www.mixOmics.org for more +details. 
+} +\author{ +Florian Rohart, Al J Abadi +} +\keyword{dplot} +\keyword{multivariate} diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 05896dd3..be0f7386 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -9,18 +9,18 @@ tune.mint.splsda( Y, ncomp = 1, study, - test.keepX = c(5, 10, 15), + test.keepX = NULL, already.tested.X, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, - signif.threshold = 0.01 + signif.threshold = 0.01, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + light.output = TRUE ) } \arguments{ @@ -36,12 +36,27 @@ Default to 1} study} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set. If set to NULL only number of components will be tuned.} \item{already.tested.X}{if \code{ncomp > 1} Numeric vector indicating the number of variables to select from the \eqn{X} data set on the firsts components} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{only applies to an object inheriting from \code{"plsda"} or \code{"splsda"} to evaluate the classification performance of the model. Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, @@ -49,7 +64,8 @@ Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, \code{\link{predict}}.} \item{measure}{Two misclassification measure are available: overall -misclassification error \code{overall} or the Balanced Error Rate \code{BER}} +misclassification error \code{overall} or the Balanced Error Rate \code{BER}. +Only used when \code{test.keepX = NULL}.} \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model.} @@ -57,23 +73,8 @@ performance of the model.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - -\item{tol}{Convergence stopping value.} - -\item{max.iter}{integer, the maximum number of iterations.} - -\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} -function (should be set to TRUE in particular for data with many zero -values). Default value is FALSE} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} - -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. 
Default -to 0.01.} } \value{ The returned value is a list with components: @@ -89,21 +90,29 @@ and for each component computed with the optimal keepX} \item{predict}{Prediction values for each sample, each \code{test.keepX} and each comp.} \item{class}{Predicted class for each sample, each \code{test.keepX} and each comp.} + +If \code{test.keepX = NULL}, returns: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} \item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} \item{class}{A list which gives the +predicted class of each sample for each \code{dist} and each of the +\code{ncomp} components. Directly obtained from the \code{predict} output.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +models}. } \description{ Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a -user-input grid to determine optimal values for the sparsity parameters in +user-input grid to determine optimal values for the parameters in \code{mint.splsda}. } \details{ This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), -where each of \code{study} is left out once. It returns a list of variables -of \code{X} that were selected on each of the \code{ncomp} components. Then, -a \code{\link{mint.splsda}} can be performed with \code{keepX} set as the -output \code{choice.keepX}. +where each of \code{study} is left out once. -All component \eqn{1:\code{ncomp}} are tuned, except the first ones for -which a \code{already.tested.X} is provided. See examples below. +When \code{test.keepX} is not NULL, all component \eqn{1:\code{ncomp}} are tuned to identify number of variables to keep, +except the first ones for which a \code{already.tested.X} is provided. See examples below. The function outputs the optimal number of components that achieve the best performance based on the overall error rate or BER. The assessment is @@ -123,31 +132,36 @@ More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). 
}
 \examples{
+# set up data
 data(stemcells)
-data = stemcells$gene
-type.id = stemcells$celltype
-exp = stemcells$study
-
-res = mint.splsda(X=data,Y=type.id,ncomp=3,keepX=c(10,5,15),study=exp)
-out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE,
-study=exp,test.keepX=seq(1,10,1))
-
-out$choice.ncomp
-out$choice.keepX
-
-\dontrun{
-
-out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE,
-study=exp,test.keepX=seq(1,10,1))
-out$choice.keepX
-
-## only tune component 2 and keeping 10 genes on comp1
-out = tune.mint.splsda(X=data,Y=type.id,ncomp=2, study=exp,
-already.tested.X = c(10),
-test.keepX=seq(1,10,1))
-out$choice.keepX
-
-}
+data <- stemcells$gene
+type.id <- stemcells$celltype
+exp <- stemcells$study
+
+# tune number of components
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 5,
+                             near.zero.var = FALSE,
+                             study = exp,
+                             test.keepX = NULL)
+
+plot(tune_res)
+tune_res$choice.ncomp # 1 component
+
+## tune number of variables to keep
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 1,
+                             near.zero.var = FALSE,
+                             study = exp,
+                             test.keepX = seq(1,10,1))
+
+plot(tune_res)
+tune_res$choice.keepX # 9 variables to keep on component 1
+
+## only tune component 2, keeping 9 genes on comp1
+tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 2, study = exp,
+                             already.tested.X = c(9),
+                             test.keepX = seq(1,10,1))
+plot(tune_res)
+tune_res$choice.keepX # 10 variables to keep on comp2
 }
 \references{
 Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017).
diff --git a/man/tune.pca.Rd b/man/tune.pca.Rd
index 8e8593a3..dc808fbe 100644
--- a/man/tune.pca.Rd
+++ b/man/tune.pca.Rd
@@ -8,8 +8,8 @@ tune.pca(
   X,
   ncomp = NULL,
   center = TRUE,
-  scale = FALSE,
-  max.iter = 500,
+  scale = TRUE,
+  max.iter = 100,
   tol = 1e-09,
   logratio = c("none", "CLR", "ILR"),
   V = NULL,
@@ -87,9 +87,22 @@ internal pre-processing step, through \code{\link{logratio.transfo}} and
 \code{\link{withinVariation}} respectively.
 }
 \examples{
+# load data
 data(liver.toxicity)
+
+# run tuning
 tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE)
-tune
+plot(tune)
+
+# set up multilevel dataset
+repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5,
+                  6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9,
+                  10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14,
+                  13, 14, 15, 16, 15, 16, 15, 16, 15, 16)
+design <- data.frame(sample = repeat.indiv)
+
+# run tuning
+tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE, multilevel = design)
 plot(tune)
 }
 \seealso{
diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd
new file mode 100644
index 00000000..866953f2
--- /dev/null
+++ b/man/tune.pls.Rd
@@ -0,0 +1,245 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tune.pls.R
+\name{tune.pls}
+\alias{tune.pls}
+\title{Tuning functions for PLS method}
+\usage{
+tune.pls(
+  X,
+  Y,
+  ncomp,
+  validation = c("Mfold", "loo"),
+  nrepeat = 1,
+  folds,
+  measure = NULL,
+  mode = c("regression", "canonical", "classic"),
+  scale = TRUE,
+  logratio = "none",
+  tol = 1e-06,
+  max.iter = 100,
+  near.zero.var = FALSE,
+  multilevel = NULL,
+  BPPARAM = SerialParam(),
+  seed = NULL,
+  progressBar = FALSE,
+  ...
+)
+}
+\arguments{
+\item{X}{numeric matrix of predictors with the rows as individual observations.}
+
+\item{Y}{numeric matrix of response(s) with the rows as individual observations matching \code{X}.}
+
+\item{ncomp}{Positive Integer. The number of components to include in the
+model. 
Default to 2.}
+
+\item{validation}{character. What kind of (internal) validation to use,
+matching one of \code{"Mfold"} or \code{"loo"} (Leave-One-out). Default is
+\code{"Mfold"}.}
+
+\item{nrepeat}{Positive integer. Number of times the Cross-Validation process
+should be repeated. \code{nrepeat > 2} is required for robust tuning. See
+details.}
+
+\item{folds}{Positive integer, the folds in the Mfold cross-validation.}
+
+\item{measure}{The tuning measure to use. Cannot be NULL when applied to an sPLS1 object. See details.}
+
+\item{mode}{Character string indicating the type of PLS algorithm to use. One
+of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.}
+
+\item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE)}
+
+\item{logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.}
+
+\item{tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.}
+
+\item{max.iter}{Integer, the maximum number of iterations. Default to 100.}
+
+\item{near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.}
+
+\item{multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.}
+
+\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type
+of parallelisation. See examples in \code{?tune.spca}.}
+
+\item{seed}{set a number here if you want the function to give reproducible outputs.
+Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.}
+
+\item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the
+computation completes. Default to \code{FALSE}.}
+
+\item{...}{Optional parameters passed to \code{\link{pls}}}
+}
+\value{
+Returns a list with the following components for every repeat:
+\item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only
+applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
+available when in regression (s)PLS.}
+\item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only
+applies to object inherited from \code{"pls"}, and \code{"spls"}. Only
+available when in regression (s)PLS.}
+\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models
+with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object
+inherited from \code{"pls"}, and \code{"spls"}. Only available when in
+regression (s)PLS.}
+\item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values
+else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable.
+Note that in the specific case of an sPLS model, it is better to have a look
+at the Q2.total criterion, only applies to object inherited from
+\code{"pls"}, and \code{"spls"}. 
Only available when in regression (s)PLS.}
+\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1,
+\ldots ,}\code{ncomp} components, only applies to object inherited from
+\code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.}
+\item{RSS}{Residual Sum of Squares across all selected features and the
+components.}
+\item{PRESS}{Predicted Residual Error Sum of Squares across all selected
+features and the components.}
+\item{features}{a list of features selected across the
+folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and
+\code{keepY} parameters from the input object. Note, this will be \code{NULL}
+if using standard (non-sparse) PLS.}
+\item{cor.tpred, cor.upred}{Correlation between the
+predicted and actual components for X (t) and Y (u)}
+\item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the
+predicted and actual components for X (t) and Y (u)}
+}
+\description{
+Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input
+grid to determine optimal values for the parameters in \code{spls}.
+}
+\details{
+This tuning function should be used to tune the number of components for (s)PLS models.
+}
+\section{folds}{
+
+During a cross-validation (CV), data are randomly split into \code{M}
+subgroups (folds). \code{M-1} subgroups are then used to train submodels
+which are then used to compute prediction accuracy statistics for the
+held-out (test) data. All subgroups are used as the test data exactly once.
+If \code{validation = "loo"}, leave-one-out CV is used where each group
+consists of exactly one sample and hence \code{M == N} where N is the number
+of samples.
+}
+
+\section{nrepeat}{
+
+The cross-validation process is repeated \code{nrepeat} times and the
+accuracy measures are averaged across repeats. If \code{validation = "loo"},
+the process does not need to be repeated as there is only one way to split N
+samples into N groups and hence nrepeat is forced to be 1.
+}
+
+\section{measure}{
+
+\itemize{
+\item \bold{For PLS2} Two measures of accuracy are available: Correlation
+(\code{cor}, used as default), as well as the Residual Sum of Squares
+(\code{RSS}). For \code{cor}, the parameters which would maximise the
+correlation between the predicted and the actual components are chosen. The
+\code{RSS} measure tries to predict the held-out data by matrix
+reconstruction and seeks to minimise the error between actual and predicted
+values. For \code{mode='canonical'}, the X matrix is used to calculate the
+\code{RSS}, while for other modes the \code{Y} matrix is used. This measure
+gives more weight to any large errors and is thus sensitive to outliers. It
+also intrinsically selects fewer features on the \code{Y} block
+compared to \code{measure='cor'}.
+\item \bold{For PLS1} Four measures of accuracy are available: Mean Absolute
+Error (\code{MAE}), Mean Square Error (\code{MSE}, used as default),
+\code{Bias} and \code{R2}. Both MAE and MSE average the model prediction
+error. MAE measures the average magnitude of the errors without considering
+their direction. It is the average over the fold test samples of the absolute
+differences between the Y predictions and the actual Y observations. The MSE
+also measures the average magnitude of the error. Since the errors are
+squared before they are averaged, the MSE tends to give a relatively high
+weight to large errors. 
The Bias is the average of the differences between
+the Y predictions and the actual Y observations, and the R2 is the correlation
+between the predictions and the observations.
+}
+}
+
+\section{Optimisation Process}{
+
+The optimisation process is data-driven and similar to the process detailed
+in (Rohart et al., 2016), where one-sided t-tests assess whether there is a
+gain in performance when incrementing the number of features or components in
+the model. However, it assesses the whole provided grid through pair-wise
+comparisons as the performance criteria do not always change linearly with
+respect to the added number of features or components.
+}
+
+\section{more}{
+
+See also \code{?perf} for more details.
+}
+
+\examples{
+# set up data
+data(liver.toxicity)
+X <- liver.toxicity$gene
+Y <- liver.toxicity$clinic
+
+# tune PLS2 model to find optimal number of components
+tune.res <- tune.pls(X, Y, ncomp = 10, measure = "cor",
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+plot(tune.res) # plot outputs
+
+# PLS1 model example
+Y1 <- liver.toxicity$clinic[,1]
+
+tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor",
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+
+plot(tune.res)
+
+# Multilevel PLS2 model
+repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5,
+                  6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9,
+                  10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14,
+                  13, 14, 15, 16, 15, 16, 15, 16, 15, 16)
+design <- data.frame(sample = repeat.indiv)
+
+tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", multilevel = design,
+                     folds = 5, nrepeat = 3, progressBar = TRUE)
+
+plot(tune.res)
+}
+\references{
+mixOmics article:
+
+Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics
+feature selection and multiple data integration. PLoS Comput Biol 13(11):
+e1005752
+
+PLS and PLS criteria for PLS regression: Tenenhaus, M. (1998). La regression
+PLS: theorie et pratique. Paris: Editions Technic.
+
+Chavent, Marie and Patouille, Brigitte (2003). Calcul des coefficients de
+regression et du PRESS en regression PLS1. Modulad 30, 1-11. (this is the
+formula we use to calculate the Q2 in perf.pls and perf.spls)
+
+Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction
+(MSEP) Estimates for Principal Component Regression (PCR) and Partial Least
+Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429.
+
+Sparse PLS regression mode:
+
+Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse
+PLS for variable selection when integrating Omics data. Statistical
+Applications in Genetics and Molecular Biology 7, article 35.
+
+One-sided t-tests (suppl material):
+
+Rohart F, Mason EA, Matigian N, Mosbergen R, Korn O, Chen T, Butcher S,
+Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A, Wells CA
+(2016). A Molecular Classification of Human Mesenchymal Stromal Cells. PeerJ
+4:e1845.
+}
+\seealso{
+\code{\link{splsda}}, \code{\link{predict.splsda}}, and http://www.mixOmics.org for more details.
+}
+\author{
+Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. 
diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd new file mode 100644 index 00000000..e6c7cf2b --- /dev/null +++ b/man/tune.plsda.Rd @@ -0,0 +1,194 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.plsda.R +\name{tune.plsda} +\alias{tune.plsda} +\title{Tuning functions for PLS-DA method} +\usage{ +tune.plsda( + X, + Y, + ncomp = 1, + scale = TRUE, + logratio = c("none", "CLR"), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, + validation = "Mfold", + folds = 10, + nrepeat = 1, + signif.threshold = 0.01, + dist = "all", + auc = FALSE, + progressBar = FALSE, + light.output = TRUE, + BPPARAM = SerialParam(), + seed = NULL +) +} +\arguments{ +\item{X}{numeric matrix of predictors. \code{NA}s are allowed.} + +\item{Y}{a factor or a class vector for the discrete outcome.} + +\item{ncomp}{the number of components to include in the model.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{logratio}{one of ('none','CLR'). Default to 'none'} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{tol}{Convergence stopping value.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements) that indicates the repeated measures on each individual, i.e. +the individuals ID. See Details.} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +Default is \code{"Mfold"}.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + +\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all"} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model.} + +\item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the +computation completes. Default to \code{FALSE}.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each comp is returned.} + +\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type +of parallelisation. See examples in \code{?tune.spca}.} + +\item{seed}{set a number here if you want the function to give reproducible outputs. +Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} +} +\value{ +A matrix of classification error rate estimation. +The dimensions correspond to the components in the +model and to the prediction method used, respectively. + +\item{auc}{Averaged AUC values +over the \code{nrepeat}} + +\item{cor.value}{only if multilevel analysis with 2 factors: correlation +between latent variables.} +} +\description{ +Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +grid to determine optimal values for the parameters in \code{plsda}. 
+} +\details{ +This tuning function should be used to tune the parameters in the +\code{plsda} function (number of components and distance metric to select). + +For a PLS-DA, M-fold or LOO cross-validation is performed with stratified +subsampling where all classes are represented in each fold. + +If \code{validation = "loo"}, leave-one-out cross-validation is performed. +By default \code{folds} is set to the number of unique individuals. + +The function outputs the optimal number of components that achieve the best +performance based on the overall error rate or BER. The assessment is +data-driven and similar to the process detailed in (Rohart et al., 2016), +where one-sided t-tests assess whether there is a gain in performance when +adding a component to the model. Our experience has shown that in most cases, +the optimal number of components is the number of categories in \code{Y} - +1, but it is worth tuning a few extra components to check (see our website +and case studies for more details). + +For PLS-DA multilevel one-factor analysis, M-fold or LOO cross-validation +is performed where all repeated measurements of one sample are in the same +fold. Note that logratio transform and the multilevel analysis are performed +internally and independently on the training and test set. + +For a PLS-DA multilevel two-factor analysis, the correlation between +components from the within-subject variation of X and the \code{cond} matrix +is computed on the whole data set. The reason why we cannot obtain a +cross-validation error rate as in the PLS-DA one-factor analysis is the +difficulty of decomposing and predicting the within matrices within each fold. + +For a PLS two-factor analysis a PLS canonical mode is run, and the +correlation between components from the within-subject variation of X and Y +is computed on the whole data set. + +If \code{validation = "Mfold"}, M-fold cross-validation is performed. How +many folds to generate is selected by specifying the number of folds in +\code{folds}. + +If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the +Area Under the Curve is averaged using one-vs-all comparison. Note however +that the AUC criterion may not be particularly insightful, as the prediction +threshold we use in PLS-DA differs from an AUC threshold (PLS-DA relies on +prediction distances for predictions; see \code{?predict.plsda} and the +supplemental material of the mixOmics article (Rohart et +al. 2017) for more details). + +BER is appropriate in the case of an unbalanced number of samples per class, as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). + +The \code{tune.plsda()} function calls the older \code{perf()} function to perform this cross-validation; +for more details see the \code{perf()} help page. 
+} +\examples{ +## Example: analysis with PLS-DA +data(breast.tumors) + +# tune components and distance +tune = tune.plsda(breast.tumors$gene.exp, as.factor(breast.tumors$sample$treatment), + ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +## Example: multilevel PLS-DA +data(vac18) +design <- data.frame(sample = vac18$sample) # set the multilevel design + +tune1 <- tune.plsda(vac18$genes, vac18$stimulation, + ncomp = 5, multilevel = design, + nrepeat = 10, folds = 10, + seed = 20) +plot(tune1) +} +\references{ +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 +} +\seealso{ +\code{\link{splsda}}, \code{\link{predict.splsda}} and +http://www.mixOmics.org for more details. +} +\author{ +Kim-Anh Lê Cao, Benoit Gautier, Francois Bartolo, Florian Rohart, +Al J Abadi +} +\keyword{multivariate} +\keyword{regression} diff --git a/man/tune.rcc.Rd b/man/tune.rcc.Rd index 347342dc..6f43fd7b 100644 --- a/man/tune.rcc.Rd +++ b/man/tune.rcc.Rd @@ -11,7 +11,6 @@ tune.rcc( grid2 = seq(0.001, 1, length = 5), validation = c("loo", "Mfold"), folds = 10, - plot = TRUE, BPPARAM = SerialParam(), seed = NULL ) @@ -34,9 +33,6 @@ method to use, (partially) matching one of \code{"loo"} (leave-one-out) or \item{folds}{positive integer. Number of folds to use if \code{validation="Mfold"}. Defaults to \code{folds=10}.} -\item{plot}{logical argument indicating whether an image map should be -plotted by calling the \code{imgCV} function.} - \item{BPPARAM}{a BiocParallel parameter object; see \code{BiocParallel::bpparam} for details. Default is \code{MulticoreParam()} for parallel processing.} @@ -73,13 +69,16 @@ of the data matrix using the \code{nipals} function. Otherwise, missing values are handled by casewise deletion in the \code{rcc} function. } \examples{ +#load data data(nutrimouse) X <- nutrimouse$lipid Y <- nutrimouse$gene -## this can take some seconds +# run tuning +tune_res <- tune.rcc(X, Y, validation = "Mfold") -tune.rcc(X, Y, validation = "Mfold") +# plot output +plot(tune_res) } \seealso{ \code{\link{image.tune.rcc}} and http://www.mixOmics.org for more diff --git a/man/tune.spca.Rd b/man/tune.spca.Rd index d6a78ee3..1c3340fd 100644 --- a/man/tune.spca.Rd +++ b/man/tune.spca.Rd @@ -32,7 +32,7 @@ ncol(X))}} \item{folds}{Number of folds in 'Mfold' cross-validation. See details.} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set.} \item{center}{(Default=TRUE) Logical, whether the variables should be shifted to be zero centered. Only set to FALSE if data have already been centered. 
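# ---- Illustrative sketch (not part of the patch): the revised tune.rcc
# workflow now that the 'plot' argument is removed -- tune first, then call
# plot() on the returned object. SnowParam(workers = 2) is just one possible
# BiocParallel backend, mirroring the updated tests below.
library(mixOmics)
library(BiocParallel)
data(nutrimouse)
X <- nutrimouse$lipid
Y <- nutrimouse$gene

# cross-validate the regularisation grid, optionally in parallel
cv.score <- tune.rcc(X, Y, validation = "Mfold",
                     BPPARAM = SnowParam(workers = 2), seed = 12)

# plotting is now a separate, explicit step
plot(cv.score)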
diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index d9250d14..ba6f0c0f 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tune.spls.R \name{tune.spls} \alias{tune.spls} -\title{Tuning functions for sPLS and PLS functions} +\title{Tuning functions for sPLS method} \usage{ tune.spls( X, @@ -10,15 +10,20 @@ tune.spls( test.keepX = NULL, test.keepY = NULL, ncomp, + mode = c("regression", "canonical", "classic"), + scale = TRUE, + logratio = "none", + tol = 1e-09, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, validation = c("Mfold", "loo"), nrepeat = 1, folds, - mode = c("regression", "canonical", "classic"), measure = NULL, BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) } @@ -36,6 +41,21 @@ test from the \eqn{Y} data set. Default to \code{ncol(Y)}.} \item{ncomp}{Positive Integer. The number of components to include in the model. Default to 2.} +\item{mode}{Character string indicating the type of PLS algorithm to use. One +of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE)} + +\item{logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} + +\item{tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-09.} + +\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} + +\item{multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one-factor decomposition, the repeated measures on each individual, i.e. the individuals ID, is input as the first column. For a two-factor decomposition, the 2nd and 3rd columns indicate those factors. See examples.} + \item{validation}{character. What kind of (internal) validation to use, matching one of \code{"Mfold"} or \code{"loo"} (Leave-One-out). Default is \code{"Mfold"}.} @@ -46,10 +66,7 @@ details.} \item{folds}{Positive Integer, The folds in the Mfold cross-validation.} -\item{mode}{Character string indicating the type of PLS algorithm to use. One -of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} - -\item{measure}{The tuning measure to use. See details.} +\item{measure}{The tuning measure to use. Cannot be NULL when applied to an sPLS1 object. See details.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} @@ -60,12 +77,11 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' \item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the computation completes. 
Default to \code{FALSE}.} -\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} - \item{...}{Optional parameters passed to \code{\link{spls}}} } \value{ -A list that contains: \item{cor.pred}{The correlation of predicted vs +If \code{test.keepX != NULL} and \code{test.keepY != NULL}, returns a list that contains: +\item{cor.pred}{The correlation of predicted vs actual components from X (t) and Y (u) for each component}\item{RSS.pred}{The Residual Sum of Squares of predicted vs actual components from X (t) and Y (u) for each component} @@ -75,11 +91,46 @@ A list that contains: \item{cor.pred}{The correlation of predicted vs \item{choice.ncomp}{returns the optimal number of components for the model fitted with \code{$choice.keepX} and \code{$choice.keepY} } \item{call}{The function call including the parameters used.} + +If \code{test.keepX = NULL} and \code{test.keepY = NULL}, returns a list with the following components for every repeat: +\item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +inherited from \code{"pls"}, and \code{"spls"}. Only available when in +regression (s)PLS.} +\item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +Note that in the specific case of an sPLS model, it is better to have a look +at the Q2.total criterion, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +\ldots ,}\code{ncomp} components, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +\item{RSS}{Residual Sum of Squares across all selected features and the +components.} +\item{PRESS}{Predicted Residual Error Sum of Squares across all selected +features and the components.} +\item{features}{a list of features selected across the +folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +\code{keepY} parameters from the input object. Note, this will be \code{NULL} +if using standard (non-sparse) PLS.} +\item{cor.tpred, cor.upred}{Correlation between the +predicted and actual components for X (t) and Y (u)} +\item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +predicted and actual components for X (t) and Y (u)} } \description{ -This function uses repeated cross-validation to tune hyperparameters such as -the number of features to select and possibly the number of components to -extract. +Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +grid to determine optimal values for the parameters in \code{spls}. +} +\details{ +This tuning function should be used to tune the parameters in the +\code{spls} function (number of components and number of variables to select). } \section{folds}{ @@ -144,22 +195,65 @@ See also \code{?perf} for more details. 
} \examples{ +## sPLS2 model example (more than one Y outcome variable) -\dontrun{ +# set up data data(liver.toxicity) X <- liver.toxicity$gene Y <- liver.toxicity$clinic -set.seed(42) -tune.res = tune.spls( X, Y, ncomp = 3, - test.keepX = c(5, 10, 15), - test.keepY = c(3, 6, 8), measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) -tune.res$choice.ncomp -tune.res$choice.keepX -tune.res$choice.keepY -# plot the results -plot(tune.res) -} + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs + +## sPLS1 model example (only one Y outcome variable) + +# set up data +Y1 <- liver.toxicity$clinic[,1] + +# tune spls model for components only +plot(tune.spls(X, Y1, ncomp = 3, + folds = 3, + test.keepX = NULL, test.keepY = NULL)) + +# tune spls model for number of X variables to keep; note that for sPLS1 models 'measure' must be set +plot(tune.spls(X, Y1, ncomp = 3, + folds = 3, measure = "MSE", + test.keepX = c(5, 10, 15))) + + +## sPLS2 multilevel model example + +# set up multilevel design +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs } \references{ mixOmics article: diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index a322ebb7..cfe20a24 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -8,23 +8,23 @@ tune.splsda( X, Y, ncomp = 1, - test.keepX = c(5, 10, 15), + test.keepX = NULL, already.tested.X, + scale = TRUE, + logratio = c("none", "CLR"), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, dist = "max.dist", measure = "BER", - scale = TRUE, auc = FALSE, progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - logratio = c("none", "CLR"), - multilevel = NULL, light.output = TRUE, - signif.threshold = 0.01, BPPARAM = SerialParam(), seed = NULL ) @@ -38,58 +38,60 @@ responses (for multi-response models) \code{NA}s are allowed.} \item{ncomp}{the number of components to include in the model.} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set. 
If set to NULL, tuning will be performed on ncomp +using all variables in the \eqn{X} data set.} \item{already.tested.X}{Optional, if \code{ncomp > 1} A numeric vector indicating the number of variables to select from the \eqn{X} data set on the first components.} -\item{validation}{character. What kind of (internal) validation to use, -matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). -Default is \code{"Mfold"}.} - -\item{folds}{the folds in the Mfold cross-validation. See Details.} - -\item{dist}{distance metric to use for \code{splsda} to estimate the -classification error rate, should be a subset of \code{"centroids.dist"}, -\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} - -\item{measure}{Three misclassification measure are available: overall -misclassification error \code{overall}, the Balanced Error Rate \code{BER} -or the Area Under the Curve \code{AUC}} - \item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE)} -\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) -performance of the model based on the optimisation measure \code{measure}.} +\item{logratio}{one of ('none','CLR'). Default to 'none'} -\item{progressBar}{by default set to \code{TRUE} to output the progress bar -of the computation.} +\item{max.iter}{integer, the maximum number of iterations.} \item{tol}{Convergence stopping value.} -\item{max.iter}{integer, the maximum number of iterations.} - \item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Default value is FALSE} -\item{nrepeat}{Number of times the Cross-Validation process is repeated.} - -\item{logratio}{one of ('none','CLR'). Default to 'none'} - \item{multilevel}{Design matrix for multilevel analysis (for repeated measurements) that indicates the repeated measures on each individual, i.e. the individuals ID. See Details.} -\item{light.output}{if set to FALSE, the prediction/classification of each -sample for each of \code{test.keepX} and each comp is returned.} +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +Default is \code{"Mfold"}.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} \item{signif.threshold}{numeric between 0 and 1 indicating the significance threshold required for improvement in error rate of the components. Default to 0.01.} +\item{dist}{distance metric to use for \code{splsda} to estimate the +classification error rate, should be one of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +If \code{test.keepX = NULL}, multiple distances, or \code{"all"}, can be input.} + +\item{measure}{Three misclassification measures are available: overall +misclassification error \code{overall}, the Balanced Error Rate \code{BER} +or the Area Under the Curve \code{AUC}. 
Only used when \code{test.keepX} is not NULL.} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model based on the optimisation measure \code{measure}.} + +\item{progressBar}{by default set to \code{FALSE}; if \code{TRUE}, outputs the progress bar +of the computation.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} @@ -107,6 +109,14 @@ components for the model fitted with \code{$choice.keepX} } \item{error.rate.class}{returns the error rate for each level of \code{Y} and for each component computed with the optimal keepX} +If \code{test.keepX = NULL}, produces a matrix of classification +error rate estimation. The dimensions correspond to the components in the +model and to the prediction method used, respectively. Note that error rates +reported in any component include the performance of the model in earlier +components for the specified \code{keepX} parameters (e.g. error rate +reported for component 3 for \code{keepX = 20} already includes the fitted +model on components 1 and 2 for \code{keepX = 20}). + \item{predict}{Prediction values for each sample, each \code{test.keepX}, each comp and each repeat. Only if light.output=FALSE} \item{class}{Predicted class for each sample, each \code{test.keepX}, each @@ -120,8 +130,7 @@ between latent variables.} } \description{ Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input -grid to determine optimal values for the sparsity parameters in -\code{splsda}. +grid to determine optimal values for the parameters in \code{splsda}. } \details{ This tuning function should be used to tune the parameters in the @@ -183,32 +192,47 @@ towards majority classes during the performance assessment. More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). + +If \code{test.keepX} is set to NULL, the \code{perf()} function will be run internally, +performing cross-validation to identify the optimal number of components and +distance measure. Running the tuning initially with \code{test.keepX = NULL} speeds +up the parameter tuning workflow, as a lower \code{ncomp} value can then be used for +variable selection tuning. 
} \examples{ ## First example: analysis with sPLS-DA - data(breast.tumors) X = breast.tumors$gene.exp Y = as.factor(breast.tumors$sample$treatment) -tune = tune.splsda(X, Y, ncomp = 1, nrepeat = 10, logratio = "none", -test.keepX = c(5, 10, 15), folds = 10, dist = "max.dist", -progressBar = TRUE) -\dontrun{ -# 5 components, optimising 'keepX' and 'ncomp' -tune = tune.splsda(X, Y, ncomp = 5, test.keepX = c(5, 10, 15), -folds = 10, dist = "max.dist", nrepeat = 5, progressBar = FALSE) +# first tune on components only +tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = NULL, + dist = "all", + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +# then tune optimal keepX for each component +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), dist = "centroids.dist", + progressBar = TRUE, + seed = 20) -tune$choice.ncomp -tune$choice.keepX plot(tune) - -## only tune component 3 and 4 -# keeping 5 and 10 variables on the first two components respectively - -tune = tune.splsda(X = X,Y = Y, ncomp = 4, -already.tested.X = c(5,10), -test.keepX = seq(1,10,2), progressBar = TRUE) +tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) + +## With already tested variables: +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), + dist = "centroids.dist", + progressBar = TRUE, + seed = 20) +plot(tune) ## Second example: multilevel one-factor analysis with sPLS-DA @@ -218,10 +242,17 @@ Y = vac18$stimulation # sample indicates the repeated measurements design = data.frame(sample = vac18$sample) +# tune on components +tune = tune.splsda(X, Y = Y, ncomp = 5, nrepeat = 10, logratio = "none", + test.keepX = NULL, folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) + +# tune on variables tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", -test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) + test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) -} +plot(tune) } \references{ mixOmics article: diff --git a/tests/testthat/test-tune.block.plsda.R b/tests/testthat/test-tune.block.plsda.R new file mode 100644 index 00000000..9ede1dd0 --- /dev/null +++ b/tests/testthat/test-tune.block.plsda.R @@ -0,0 +1,53 @@ +context("tune.block.plsda") +library(BiocParallel) + +test_that("tune.block.plsda works and is the same as perf alone and in tune wrapper", code = { + + data("breast.TCGA") + X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, protein = breast.TCGA$data.train$protein) + Y <- breast.TCGA$data.train$subtype + set.seed(100) + subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 10)[[1]][[1]] + X <- lapply(X, function(omic) omic[subset,]) + Y <- Y[subset] + design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) + diag(design) = 0 + design + + # run tune.block.plsda + tune.res.1 <- suppressWarnings( + tune.block.plsda(X, Y, design = design, ncomp = 2, nrepeat = 1, folds = 3, + seed = 13, dist = c("all")) + ) + # run tune wrapper + tune.res.2 <- suppressWarnings( + tune(X, Y, design = design, ncomp = 2, nrepeat = 1, folds = 3, + seed = 13, dist = c("all"), method = "block.plsda") + ) + # run perf + model <- 
block.plsda(X, Y, design = design, ncomp = 2) + tune.res.3 <- suppressWarnings( + perf(model, nrepeat = 1, seed = 13, folds = 3) + ) + + # check outputs format + expect_equal(class(tune.res.1)[1], "perf.sgccda.mthd") + expect_equal(class(tune.res.2)[1], "perf.sgccda.mthd") + expect_equal(class(tune.res.3)[1], "perf.sgccda.mthd") + + # check output values + .expect_numerically_close(tune.res.1$error.rate.per.class$mrna$max.dist[3,2], 0.25) + .expect_numerically_close(tune.res.2$error.rate.per.class$mrna$max.dist[3,2], 0.25) + .expect_numerically_close(tune.res.3$error.rate.per.class$mrna$max.dist[3,2], 0.25) + + # check can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + +}) + + \ No newline at end of file diff --git a/tests/testthat/test-tune.block.splsda.R b/tests/testthat/test-tune.block.splsda.R index 7e9b81bd..a5caf1f5 100644 --- a/tests/testthat/test-tune.block.splsda.R +++ b/tests/testthat/test-tune.block.splsda.R @@ -157,4 +157,52 @@ test_that("tune.block.splsda works independently and in tune wrapper the same", BPPARAM = SerialParam(), seed = 42 ) expect_equal(tune11$choice.keepX, tune41$choice.keepX) -}) \ No newline at end of file + + # check can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune41)) + expect_silent(plot(tune11)) + +}) + +test_that("tune.block.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + data("breast.TCGA") + data = list( + mrna = breast.TCGA$data.train$mrna, + mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein + ) + + design = 'full' + ncomp <- 2 + folds <- 3 + nrep <- 3 + test.keepX = list( + mrna = c(10, 20), + mirna = c(20, 30), + protein = c(3, 6) + ) + + set.seed(100) + subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 4)[[1]][[1]] + X <- lapply(data, function(omic) omic[subset,]) + Y <- breast.TCGA$data.train$subtype[subset] + + # tune on components only + tune_res <- suppressWarnings( + tune.block.splsda(X, Y, ncomp = 3, design = design, + nrepeat = 1, folds = 3, + test.keepX = NULL, seed = 20) + ) + + # run perf + block.splsda_res <- block.splsda(X, Y, ncomp = 3, design = design) + perf_res <- suppressWarnings( + perf(block.splsda_res, ncomp = 3, nrepeat = 1, folds = 3, dist = "max.dist", seed = 20) + ) + + # check results match + expect_equal(tune_res$error.rate$mrna[1,1], perf_res$error.rate$mrna[1,1]) +}) diff --git a/tests/testthat/test-tune.mint.plsda.R b/tests/testthat/test-tune.mint.plsda.R new file mode 100644 index 00000000..d65b575b --- /dev/null +++ b/tests/testthat/test-tune.mint.plsda.R @@ -0,0 +1,46 @@ +context("tune.mint.plsda") +library(BiocParallel) + +test_that("tune.mint.plsda works and is the same as perf alone and in tune wrapper", code = { + + # set up data + data(stemcells) + X = stemcells$gene + Y = stemcells$celltype + study = stemcells$study + + # run alone + tune.res.1 = suppressWarnings( + tune.mint.plsda(X, Y, study = study, ncomp = 2) + ) + + # run in wrapper + tune.res.2 = suppressWarnings( + tune(method = "mint.plsda", X, Y, study = study, ncomp = 2) + ) + + # run perf + model <- mint.plsda(X, Y, ncomp = 2, study = study) + tune.res.3 = suppressWarnings( + perf(model, ncomp = 2) + ) + + # check outputs format + expect_equal(class(tune.res.1)[1], "perf") + expect_equal(class(tune.res.2)[1], "perf") + expect_equal(class(tune.res.3)[1], "perf") + + # check outputs values + 
.expect_numerically_close(tune.res.1$global.error$BER[1,1], 0.3803556) + .expect_numerically_close(tune.res.2$global.error$BER[1,1], 0.3803556) + .expect_numerically_close(tune.res.3$global.error$BER[1,1], 0.3803556) + + # check can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + + +}) \ No newline at end of file diff --git a/tests/testthat/test-tune.mint.splsda.R b/tests/testthat/test-tune.mint.splsda.R index 23858a20..8719f6bf 100644 --- a/tests/testthat/test-tune.mint.splsda.R +++ b/tests/testthat/test-tune.mint.splsda.R @@ -20,6 +20,11 @@ test_that("tune.mint.splsda works", code = { expect_is(out, "tune.mint.splsda") expect_equal(out$choice.ncomp$ncomp, 1) + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_error(plot(out), NA) # makes note about not being able to plot SD bars + }) test_that("tune.mint.splsda works with custom alpha", code = { @@ -43,4 +48,33 @@ test_that("tune.mint.splsda works with custom alpha", code = { expect_is(out, "tune.mint.splsda") expect_equal(out$choice.ncomp$ncomp, 1) + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_error(plot(out), NA) # makes note about not being able to plot SD bars + +}) + +test_that("tune.mint.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + # set up data + data(stemcells) + X = stemcells$gene + Y = stemcells$celltype + study = stemcells$study + + # tune on components only + tune_res <- suppressWarnings( + tune.mint.splsda(X, Y, study = study, ncomp = 2, + test.keepX = NULL) + ) + + # run perf + mint.splsda_res <- mint.splsda(X, Y, study = study, ncomp = 2) + perf_res <- suppressWarnings( + perf(mint.splsda_res, ncomp = 2, dist = "max.dist") + ) + + # check results match + expect_equal(tune_res$global.error$BER[1,1], perf_res$global.error$BER[1,1]) }) diff --git a/tests/testthat/test-tune.pca.R b/tests/testthat/test-tune.pca.R new file mode 100644 index 00000000..77e9afac --- /dev/null +++ b/tests/testthat/test-tune.pca.R @@ -0,0 +1,16 @@ +context("tune.pca") + +test_that("tune.pca and tune(method='pca') are equivalent", { + data(srbct) + X <- srbct$gene[1:20, 1:200] + object1 <- tune.pca(X, ncomp = 2, center = TRUE, scale = TRUE) + object2 <- tune(method = "pca", X, ncomp = 2, center = TRUE, scale = TRUE) + # expect results the same + expect_equal(object1$ncomp, object2$ncomp) + expect_equal(object1$sdev[1], object2$sdev[1]) + # check can plot outputs without errors + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(object1)) + expect_silent(plot(object2)) +}) diff --git a/tests/testthat/test-tune.pls.R b/tests/testthat/test-tune.pls.R new file mode 100644 index 00000000..9b0fba35 --- /dev/null +++ b/tests/testthat/test-tune.pls.R @@ -0,0 +1,41 @@ +context("tune.pls") +library(BiocParallel) + +test_that("tune.pls works and is the same as perf alone and in tune wrapper", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + + # independently + tune.res.1 <- tune.pls( X, Y, ncomp = 2, measure = "cor", logratio = "none", + folds = 5, nrepeat = 1, progressBar = FALSE, seed = 100) + + # in tune wrapper + tune.res.2 <- tune( X, Y, ncomp = 2, measure = "cor", logratio = "none", + folds = 5, nrepeat = 1, method = "pls", seed = 100) + + # with perf + model <- pls(X, Y, ncomp = 10, logratio = "none") + tune.res.3 <- perf(model, folds = 5, nrepeat = 1, seed = 100) + + # check outputs format + expect_equal(class(tune.res.1)[1], 
"perf.pls.mthd") + expect_equal(class(tune.res.2)[1], "perf.pls.mthd") + expect_equal(class(tune.res.3)[1], "perf.pls.mthd") + + # check output values + .expect_numerically_close(tune.res.1$measures$Q2$summary[1,3], -0.1304012) + .expect_numerically_close(tune.res.2$measures$Q2$summary[1,3], -0.1304012) + .expect_numerically_close(tune.res.3$measures$Q2$summary[1,3], -0.1304012) + + # check can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + +}) + \ No newline at end of file diff --git a/tests/testthat/test-tune.plsda.R b/tests/testthat/test-tune.plsda.R new file mode 100644 index 00000000..9ee1a112 --- /dev/null +++ b/tests/testthat/test-tune.plsda.R @@ -0,0 +1,52 @@ +context("tune.plsda") +library(BiocParallel) + +test_that("tune.plsda works and is the same perf alone and in tune wrapper", code = { + + # set up data + data(breast.tumors) + X = breast.tumors$gene.exp + Y = as.factor(breast.tumors$sample$treatment) + + # run alone + tune.plsda.res.1 = suppressWarnings( + tune.splsda(X, Y, ncomp = 2, nrepeat = 1, logratio = "none", + folds = 2, + BPPARAM = SerialParam(), seed = 42) + ) + + # run in wrapper + tune.plsda.res.2 = suppressWarnings( + tune(X, Y, ncomp = 2, nrepeat = 1, logratio = "none", + folds = 2, + BPPARAM = SnowParam(workers = 2), seed = 42, + method = "plsda") + ) + + # run perf + model <- plsda(X, Y, ncomp = 2, logratio = "none") + tune.plsda.res.3 = suppressWarnings( + perf(model, ncomp = 2, nrepeat = 1, + folds = 2, + BPPARAM = SerialParam(), seed = 42, + method = "splsda") + ) + + + # check outputs format + expect_equal(class(tune.plsda.res.1)[1], "perf") + expect_equal(class(tune.plsda.res.2)[1], "perf") + expect_equal(class(tune.plsda.res.3)[1], "perf") + # check outputs values + .expect_numerically_close(tune.plsda.res.1$error.rate$overall[1,1], 0.5106383) + .expect_numerically_close(tune.plsda.res.2$error.rate$overall[1,1], 0.5106383) + .expect_numerically_close(tune.plsda.res.3$error.rate$overall[1,1], 0.5106383) + + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.plsda.res.1)) + expect_silent(plot(tune.plsda.res.2)) + expect_silent(plot(tune.plsda.res.3)) + +}) \ No newline at end of file diff --git a/tests/testthat/test-tune.rcc.R b/tests/testthat/test-tune.rcc.R index 921d6d50..7cf83a5c 100644 --- a/tests/testthat/test-tune.rcc.R +++ b/tests/testthat/test-tune.rcc.R @@ -8,12 +8,17 @@ test_that("tune.rcc works with Mfold method", code = { Y <- nutrimouse$gene # run - tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, seed = 20) + tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", seed = 20) # check outputs expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda1, 0.5005) expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) + + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.rcc.res)) }) test_that("tune.rcc works with loo method", code = { @@ -23,12 +28,17 @@ test_that("tune.rcc works with loo method", code = { Y <- nutrimouse$gene # run - tune.rcc.res <- tune.rcc(X, Y, validation = "loo", plot = FALSE, seed = 20) + tune.rcc.res <- tune.rcc(X, Y, validation = "loo", seed = 20) # check outputs expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda1, 0.25075) expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) + + # check can plot + pdf(NULL) + on.exit(dev.off()) + 
expect_silent(plot(tune.rcc.res)) }) test_that("tune.rcc works in parallel same as in series", code = { @@ -38,16 +48,22 @@ test_that("tune.rcc works in parallel same as in series", code = { Y <- nutrimouse$gene # run in series - tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SerialParam(RNGseed = NULL), seed = 12) # run in parallel - tune.rcc.res.parallel <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res.parallel <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SnowParam(workers = 2, RNGseed = NULL), seed = 12) # check outputs expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda2, tune.rcc.res.parallel$opt.lambda2) expect_equal(tune.rcc.res$grid2, tune.rcc.res.parallel$grid2) + + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.rcc.res)) + expect_silent(plot(tune.rcc.res.parallel)) }) test_that("tune.rcc and tune(method='rcc') are equivalent", { @@ -58,7 +74,7 @@ test_that("tune.rcc and tune(method='rcc') are equivalent", { Y <- nutrimouse$gene # run independently - tune.rcc.res.1 <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res.1 <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SerialParam(RNGseed = NULL), seed = 12, grid1 = c(0.001, 0.2, 0.6, 1), grid2 = c(0.001, 0.2, 0.6, 1)) @@ -75,5 +91,11 @@ test_that("tune.rcc and tune(method='rcc') are equivalent", { expect_equal(tune.rcc.res.1$opt.lambda2, tune.rcc.res.2$opt.lambda2) expect_equal(tune.rcc.res.1$grid2, tune.rcc.res.2$grid2) + # check can plot + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(tune.rcc.res.1)) + expect_silent(plot(tune.rcc.res.2)) + }) diff --git a/tests/testthat/test-tune.spca.R b/tests/testthat/test-tune.spca.R index dc8559df..d4fd6c75 100644 --- a/tests/testthat/test-tune.spca.R +++ b/tests/testthat/test-tune.spca.R @@ -19,6 +19,11 @@ test_that("tune.spca works in serial and parallel", { expect_equal(object_parallel$choice.keepX[[1]], 35) expect_equal(object_parallel$choice.keepX[[2]], 5) .expect_numerically_close(object_parallel$cor.comp$comp1[1,2], 0.3994544) + # test can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(object_serial)) + expect_silent(plot(object_parallel)) }) test_that("tune.spca same result in serial and parallel", { @@ -77,5 +82,10 @@ test_that("tune.spca and tune(method='spca') are equivalent", { expect_equal(object1$choice.keepX[[1]], object2$choice.keepX[[1]]) expect_equal(object1$choice.keepX[[2]], object2$choice.keepX[[2]]) .expect_numerically_close(object1$cor.comp$comp1[3,3], object2$cor.comp$comp1[3,3]) + # test can plot outputs + pdf(NULL) + on.exit(dev.off()) + expect_silent(plot(object1)) + expect_silent(plot(object2)) }) \ No newline at end of file diff --git a/tests/testthat/test-tune.spls.R b/tests/testthat/test-tune.spls.R index c990e9c0..ddd860c8 100644 --- a/tests/testthat/test-tune.spls.R +++ b/tests/testthat/test-tune.spls.R @@ -10,24 +10,27 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap # run in serial tune.spls.res.1 = suppressWarnings(tune.spls(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SerialParam(RNGseed = 5), # RNGseed is ignored seed = 5212)) # run in parallel tune.spls.res.2 = suppressWarnings(tune.spls(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + 
test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SnowParam(workers = 2), seed = 5212)) # in tune wrapper in serial tune.spls.res.3 = suppressWarnings(tune(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SerialParam(RNGseed = NULL), seed = 5212, @@ -36,8 +39,9 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap # in tune wrapper in parallel tune.spls.res.4 = suppressWarnings(tune(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SnowParam(workers = 2), method = "spls", @@ -57,9 +61,15 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap expect_equal(unname(tune.spls.res.4$choice.keepY), c(3,6)) # check outputs exactly the same regardless of how the function was run - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.2$measure.pred$mean) - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.3$measure.pred$mean) - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.4$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.2$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.3$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.4$measure.pred$mean) + + # check can plot outputs + expect_silent(plot(tune.spls.res.1)) + expect_silent(plot(tune.spls.res.2)) + expect_silent(plot(tune.spls.res.3)) + expect_silent(plot(tune.spls.res.4)) }) ## If ncol(Y) == 1 tune.spls calls tune.spls1 @@ -114,3 +124,62 @@ test_that("tune.spls.1 works and is the same in parallel and when run in tune wr expect_equal(tune.spls1.MAE.1$measure.pred$mean, tune.spls1.MAE.3$measure.pred$mean) expect_equal(tune.spls1.MAE.1$measure.pred$mean, tune.spls1.MAE.4$measure.pred$mean) }) + +test_that("tune.spls works when test.keepX and test.keepY = NULL and gives same result as perf()", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + + # tune on components only + tune_res <- suppressWarnings( + tune.spls(X, Y, ncomp = 2, logratio = "none", + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + + # run perf + spls_res <- spls(X, Y, ncomp = 2, logratio = "none") + perf_res <- suppressWarnings( + perf(spls_res, ncomp = 2, nrepeat = 1, folds = 3, seed = 20) + ) + + # check results match + expect_equal(tune_res$measures$MSEP$summary[1,3], perf_res$measures$MSEP$summary[1,3]) +}) + +test_that("tune.spls works in different test cases", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + list.keepX <- c(5:10, seq(15, 50, 5)) + + # near.zero.var = TRUE + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = list.keepX, test.keepY = NULL, seed 
= 20) + ) + + # canonical mode + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", mode = "canonical", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", mode = "canonical", + test.keepX = list.keepX, test.keepY = NULL, seed = 20) + ) +}) + + diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 7ad95d23..3116f99f 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -1,7 +1,7 @@ context("tune.splsda") library(BiocParallel) -test_that("tune.spls works and is the same in parallel and when run in tune wrapper", code = { +test_that("tune.splsda works and is the same in parallel and when run in tune wrapper", code = { # set up data data(breast.tumors) @@ -31,11 +31,13 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap method = "splsda") - # check outputs + # check outputs format expect_equal(class(tune.splsda.res.1), "tune.splsda") expect_equal(class(tune.splsda.res.2), "tune.splsda") expect_equal(class(tune.splsda.res.3), "tune.splsda") expect_equal(class(tune.splsda.res.4), "tune.splsda") + + # check output values expect_equal(unname(tune.splsda.res.1$choice.keepX), c(10,15)) expect_equal(unname(tune.splsda.res.2$choice.keepX), c(10,15)) expect_equal(unname(tune.splsda.res.3$choice.keepX), c(10,15)) @@ -45,4 +47,34 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap .expect_numerically_close(tune.splsda.res.3$error.rate[1,1], 0.3111111) .expect_numerically_close(tune.splsda.res.4$error.rate[1,1], 0.3111111) + # check can plot outputs + expect_is(plot(tune.splsda.res.1), "ggplot") + expect_is(plot(tune.splsda.res.2), "ggplot") + expect_is(plot(tune.splsda.res.3), "ggplot") + expect_is(plot(tune.splsda.res.4), "ggplot") + +}) + +test_that("tune.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + # set up data + data(breast.tumors) + X <- breast.tumors$gene.exp + Y <- as.factor(breast.tumors$sample$treatment) + + # tune on components only + tune_res <- suppressWarnings( + tune.splsda(X, Y, ncomp = 2, logratio = "none", + nrepeat = 1, folds = 3, + test.keepX = NULL, seed = 20) + ) + + # run perf + splsda_res <- splsda(X, Y, ncomp = 2) + perf_res <- suppressWarnings( + perf(splsda_res, ncomp = 2, nrepeat = 1, folds = 3, seed = 20) + ) + + # check results match + expect_equal(tune_res$error.rate$overall[2,1], perf_res$error.rate$overall[2,1]) })
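# ---- Illustrative sketch (not part of the patch): the two-step tuning
# workflow enabled by test.keepX = NULL, as described in the tune.splsda
# details section and exercised by the tests above. With test.keepX = NULL,
# perf() is run internally to tune ncomp and the distance; keepX is then
# tuned on the reduced component number. Uses breast.tumors as in the Rd
# example; the optimal values quoted there (ncomp = 3, centroids.dist) are
# example outputs, not guarantees.
library(mixOmics)
data(breast.tumors)
X <- breast.tumors$gene.exp
Y <- as.factor(breast.tumors$sample$treatment)

# step 1: tune the number of components and the prediction distance only
tune.comp <- tune.splsda(X, Y, ncomp = 5, test.keepX = NULL,
                         folds = 10, nrepeat = 10, dist = "all", seed = 20)
tune.comp$choice.ncomp

# step 2: tune keepX with the reduced ncomp and the selected distance
tune.keepX <- tune.splsda(X, Y, ncomp = 3, test.keepX = c(5, 10, 15),
                          folds = 10, nrepeat = 10,
                          dist = "centroids.dist", seed = 20)
tune.keepX$choice.keepX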