From 48db6abec6c6ffbb0c3cf8b6ef654011496ab6d6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:41:49 +1100 Subject: [PATCH 01/75] update tune.splsda so runs perf if test.keepX=NULL, update docs and exampels --- R/tune.splsda.R | 592 ++++++++++++++++---------------- examples/tune.splsda-examples.R | 46 +++ man/tune.splsda.Rd | 59 ++-- 3 files changed, 371 insertions(+), 326 deletions(-) create mode 100644 examples/tune.splsda-examples.R diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 7f7ddcd6..00af63e6 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -1,13 +1,12 @@ # ======================================================================================================== -# tune.splsda: chose the optimal number of parameters per component on a splsda method +# tune.splsda: tuning hyperparameters on a splsda method # ======================================================================================================== #' Tuning functions for sPLS-DA method #' #' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input -#' grid to determine optimal values for the sparsity parameters in -#' \code{splsda}. -#' +#' grid to determine optimal values for the parameters in \code{splsda}. +#' #' #' This tuning function should be used to tune the parameters in the #' \code{splsda} function (number of components and number of variables in @@ -69,12 +68,19 @@ #' More details about the prediction distances in \code{?predict} and the #' supplemental material of the mixOmics article (Rohart et al. 2017). #' +#' If test.keepX is set to NULL, the \code{perf()} function will be run internally, +#' which performs cross-validation to identify optimal number of components and +#' distance measure. Running tuning initially using \code{test.keepX = NULL} speeds +#' up the parameter tuning workflow, as then a lower ncomp value can be used for +#' variable selection tuning. +#' #' @param X numeric matrix of predictors. \code{NA}s are allowed. #' @param Y \code{if(method = 'spls')} numeric vector or matrix of continuous #' responses (for multi-response models) \code{NA}s are allowed. #' @param ncomp the number of components to include in the model. #' @param test.keepX numeric vector for the different number of variables to -#' test from the \eqn{X} data set +#' test from the \eqn{X} data set. If set to NULL, tuning will be performed on ncomp +#' using all variables in the \eqn{X} data set. #' @param already.tested.X Optional, if \code{ncomp > 1} A numeric vector #' indicating the number of variables to select from the \eqn{X} data set on #' the firsts components. @@ -143,52 +149,16 @@ #' e1005752 #' @keywords regression multivariate #' @export -#' @examples -#' ## First example: analysis with sPLS-DA -#' -#' data(breast.tumors) -#' X = breast.tumors$gene.exp -#' Y = as.factor(breast.tumors$sample$treatment) -#' tune = tune.splsda(X, Y, ncomp = 1, nrepeat = 10, logratio = "none", -#' test.keepX = c(5, 10, 15), folds = 10, dist = "max.dist", -#' progressBar = TRUE) -#' -#' \dontrun{ -#' # 5 components, optimising 'keepX' and 'ncomp' -#' tune = tune.splsda(X, Y, ncomp = 5, test.keepX = c(5, 10, 15), -#' folds = 10, dist = "max.dist", nrepeat = 5, progressBar = FALSE) -#' -#' tune$choice.ncomp -#' tune$choice.keepX -#' plot(tune) -#' -#' ## only tune component 3 and 4 -#' # keeping 5 and 10 variables on the first two components respectively -#' -#' tune = tune.splsda(X = X,Y = Y, ncomp = 4, -#' already.tested.X = c(5,10), -#' test.keepX = seq(1,10,2), progressBar = TRUE) -#' -#' ## Second example: multilevel one-factor analysis with sPLS-DA -#' -#' data(vac18) -#' X = vac18$genes -#' Y = vac18$stimulation -#' # sample indicates the repeated measurements -#' design = data.frame(sample = vac18$sample) -#' -#' tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", -#' test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) -#' -#' } +#' @example ./examples/tune.splsda-examples.R + tune.splsda <- function (X, Y, ncomp = 1, - test.keepX = c(5, 10, 15), + test.keepX = NULL, # in which case perf() is run internally to only tune ncomp already.tested.X, validation = "Mfold", folds = 10, - dist = "max.dist", + dist = NULL, measure = "BER", # one of c("overall","BER") scale = TRUE, auc = FALSE, @@ -204,11 +174,11 @@ tune.splsda <- BPPARAM = SerialParam(), seed = NULL ) - { #-- checking general input parameters --------------------------------------# + { #-- checking general input parameters --------------------------------------# #---------------------------------------------------------------------------# - BPPARAM$RNGseed <- seed - set.seed(seed) + BPPARAM$RNGseed <- seed + set.seed(seed) #-- check significance threshold signif.threshold <- .check_alpha(signif.threshold) @@ -281,12 +251,11 @@ tune.splsda <- if (nrepeat < 3 & ncomp > 1) message("Note that the number of components cannot be reliably tuned with nrepeat < 3 or validaion = 'loo'.") + #-- logratio logratio <- match.arg(logratio) - - #if ((!is.null(already.tested.X)) && (length(already.tested.X) != (ncomp - 1)) ) - #stop("The number of already tested parameters should be NULL or ", ncomp - 1, " since you set ncomp = ", ncomp) - + + #--already.tested.X if (missing(already.tested.X)) { already.tested.X = NULL @@ -311,295 +280,310 @@ tune.splsda <- if (any(is.na(validation)) || length(validation) > 1) stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) + + #-- test.keepX + #-> if test.keepX set to NULL run perf() i.e. parameter tuning of just ncomp + if (is.null(test.keepX)){ + print("test.keepX set to NULL, tuning only for number of components...") + splsda_res <- splsda(X, Y, ncomp) + perf_res <- perf(splsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, signif.threshold = signif.threshold, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar, + auc = auc) + return(perf_res) + #-> if test.keepX is not NULL, run tune function as before + } else { - #-- test.keepX - if (is.null(test.keepX) | length(test.keepX) == 1 | !is.numeric(test.keepX)) - stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) - - # remove some test.keepX if needed - if (any(test.keepX > ncol(X))){ - test.keepX = test.keepX[-which(test.keepX>ncol(X))] - if (length(test.keepX) < 2) - stop("Some entries of 'test.keepX' were higher than the number of - variables in 'X' and were removed, the resulting 'test.keepX' has now - too few entries (<2)", call. = FALSE) - } - - # add colnames and rownames if missing - X.names = dimnames(X)[[2]] - if (is.null(X.names)) - { - X.names = paste0("X", 1:ncol(X)) - dimnames(X)[[2]] = X.names - } - - ind.names = dimnames(X)[[1]] - if (is.null(ind.names)) - { - ind.names = 1:nrow(X) - rownames(X) = ind.names - } - - if (length(unique(rownames(X))) != nrow(X)) - stop("samples should have a unique identifier/rowname") - if (length(unique(X.names)) != ncol(X)) - stop("Unique indentifier is needed for the columns of X") - - rm(X.names);rm(ind.names) - - #-- end checking --# - #------------------# - - - #---------------------------------------------------------------------------# - #-- logration + multilevel approach ----------------------------------------# - # we can do logratio and multilevel on the whole data as these transformation are done per sample - X = logratio.transfo(X = X, logratio = logratio) - - if (!is.null(multilevel)) # logratio is applied per sample, multilevel as well, so both can be done on the whole data - { + #-- test.keepX + if (length(test.keepX) == 1 | !is.numeric(test.keepX)) + stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) - Xw = withinVariation(X, design = multilevel) - X = Xw + # remove some test.keepX if needed + if (any(test.keepX > ncol(X))){ + test.keepX = test.keepX[-which(test.keepX>ncol(X))] + if (length(test.keepX) < 2) + stop("Some entries of 'test.keepX' were higher than the number of + variables in 'X' and were removed, the resulting 'test.keepX' has now + too few entries (<2)", call. = FALSE) + } - #-- Need to set Y variable for 1 or 2 factors - Y = multilevel[, -1, drop=FALSE] - if (ncol(Y) >= 1) - Y = apply(Y, 1, paste, collapse = ".") # paste is to combine in the case we have 2 levels + # add colnames and rownames if missing + X.names = dimnames(X)[[2]] + if (is.null(X.names)) + { + X.names = paste0("X", 1:ncol(X)) + dimnames(X)[[2]] = X.names + } - Y = as.factor(Y) - } - #-- logration + multilevel approach ----------------------------------------# - #---------------------------------------------------------------------------# - - - #---------------------------------------------------------------------------# - #-- NA calculation ----------------------------------------------------# - - misdata = c(X=anyNA(X), Y=FALSE) # Detection of missing data. we assume no missing values in the factor Y - - if (any(misdata)) - { - is.na.A = is.na(X) + ind.names = dimnames(X)[[1]] + if (is.null(ind.names)) + { + ind.names = 1:nrow(X) + rownames(X) = ind.names + } - #ind.NA = which(apply(is.na.A, 1, sum) > 0) # calculated only once - #ind.NA.col = which(apply(is.na.A, 2, sum) >0) # indice of the col that have missing values. used in the deflation - } else { - is.na.A = NULL - #ind.NA = ind.NA.col = NULL - } - #-- NA calculation ----------------------------------------------------# - #---------------------------------------------------------------------------# - - - - test.keepX = sort(unique(test.keepX)) #sort test.keepX so as to be sure to chose the smallest in case of several minimum - names(test.keepX) = test.keepX - # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) - if ((!is.null(already.tested.X))) - { - comp.real = (length(already.tested.X) + 1):ncomp - } else { - comp.real = 1:ncomp - } - - - choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist") - dist = match.arg(dist, choices, several.ok = TRUE) - - - mat.error.rate = list() - error.per.class = list() - AUC = list() - - if(nrepeat>1 & all(measure != "AUC")){ - mat.sd.error = matrix(0,nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), - dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) - } else{ - mat.sd.error=NULL - } - mat.mean.error = matrix(nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), - dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) - - # first: near zero var on the whole data set - if(near.zero.var == TRUE) - { - nzv = nearZeroVar(X) - if (length(nzv$Position > 0)) + if (length(unique(rownames(X))) != nrow(X)) + stop("samples should have a unique identifier/rowname") + if (length(unique(X.names)) != ncol(X)) + stop("Unique indentifier is needed for the columns of X") + + rm(X.names);rm(ind.names) + + #-- end checking --# + #------------------# + + + #---------------------------------------------------------------------------# + #-- logration + multilevel approach ----------------------------------------# + # we can do logratio and multilevel on the whole data as these transformation are done per sample + X = logratio.transfo(X = X, logratio = logratio) + + if (!is.null(multilevel)) # logratio is applied per sample, multilevel as well, so both can be done on the whole data { - warning("Zero- or near-zero variance predictors.\nReset predictors matrix to not near-zero variance predictors.\nSee $nzv for problematic predictors.") - X = X[, -nzv$Position, drop=TRUE] - if(ncol(X)==0) - stop("No more predictors after Near Zero Var has been applied!") + Xw = withinVariation(X, design = multilevel) + X = Xw + + #-- Need to set Y variable for 1 or 2 factors + Y = multilevel[, -1, drop=FALSE] + if (ncol(Y) >= 1) + Y = apply(Y, 1, paste, collapse = ".") # paste is to combine in the case we have 2 levels + Y = as.factor(Y) } - } - - if (is.null(multilevel) | (!is.null(multilevel) && ncol(multilevel) == 2)) - { - if(light.output == FALSE) - prediction.all = class.all = list() + #-- logration + multilevel approach ----------------------------------------# + #---------------------------------------------------------------------------# + + + #---------------------------------------------------------------------------# + #-- NA calculation ----------------------------------------------------# - if(auc) + misdata = c(X=anyNA(X), Y=FALSE) # Detection of missing data. we assume no missing values in the factor Y + + if (any(misdata)) { - auc.mean.sd=list() - if(light.output == FALSE) - auc.all=list() + is.na.A = is.na(X) + + #ind.NA = which(apply(is.na.A, 1, sum) > 0) # calculated only once + #ind.NA.col = which(apply(is.na.A, 2, sum) >0) # indice of the col that have missing values. used in the deflation + } else { + is.na.A = NULL + #ind.NA = ind.NA.col = NULL } + #-- NA calculation ----------------------------------------------------# + #---------------------------------------------------------------------------# - class.object=c("mixo_splsda","DA") - error.per.class.keepX.opt = list() - - if(all(measure != "AUC")){ - error.per.class.keepX.opt.mean = matrix(0, nrow = nlevels(Y), ncol = length(comp.real), - dimnames = list(c(levels(Y)), c(paste0('comp', comp.real)))) + + + test.keepX = sort(unique(test.keepX)) #sort test.keepX so as to be sure to chose the smallest in case of several minimum + names(test.keepX) = test.keepX + # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) + if ((!is.null(already.tested.X))) + { + comp.real = (length(already.tested.X) + 1):ncomp } else { - error.per.class.keepX.opt.mean=NULL + comp.real = 1:ncomp + } + + + choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist") + dist = match.arg(dist, choices, several.ok = TRUE) + + + mat.error.rate = list() + error.per.class = list() + AUC = list() + + if(nrepeat>1 & all(measure != "AUC")){ + mat.sd.error = matrix(0,nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), + dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) + } else{ + mat.sd.error=NULL } - # successively tune the components until ncomp: comp1, then comp2, ... - for(comp in 1:length(comp.real)) + mat.mean.error = matrix(nrow = length(test.keepX), ncol = ncomp-length(already.tested.X), + dimnames = list(c(test.keepX), c(paste0('comp', comp.real)))) + + # first: near zero var on the whole data set + if(near.zero.var == TRUE) { - - if (progressBar == TRUE & is(BPPARAM, 'SerialParam')) - cat("\ncomp",comp.real[comp], "\n") - - result = MCVfold.spls(X, Y, multilevel = multilevel, validation = validation, folds = folds, nrepeat = nrepeat, ncomp = 1 + length(already.tested.X), - choice.keepX = already.tested.X, - test.keepX = test.keepX, test.keepY = nlevels(Y), measure = measure, dist = dist, scale=scale, - near.zero.var = near.zero.var, progressBar = progressBar, tol = tol, max.iter = max.iter, auc = auc, - BPPARAM = BPPARAM, - misdata = misdata, is.na.A = is.na.A, class.object=class.object) - - # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.spls' can work with multiple distances - mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] - mat.mean.error[, comp]=result[[measure]]$error.rate.mean[[1]] - if (!is.null(result[[measure]]$error.rate.sd[[1]])) - mat.sd.error[, comp]=result[[measure]]$error.rate.sd[[1]] - - # confusion matrix for keepX.opt - if (all(measure!="AUC")){ - error.per.class.keepX.opt[[comp]]=result[[measure]]$confusion[[1]] - error.per.class.keepX.opt.mean[, comp]=apply(result[[measure]]$confusion[[1]], 1, mean) - } - - # best keepX - already.tested.X = c(already.tested.X, result[[measure]]$keepX.opt[[1]]) - - if(light.output == FALSE) + nzv = nearZeroVar(X) + if (length(nzv$Position > 0)) { - #prediction of each samples for each fold and each repeat, on each comp - class.all[[comp]] = result$class.comp[[1]] - prediction.all[[comp]] = result$prediction.comp + warning("Zero- or near-zero variance predictors.\nReset predictors matrix to not near-zero variance predictors.\nSee $nzv for problematic predictors.") + X = X[, -nzv$Position, drop=TRUE] + + if(ncol(X)==0) + stop("No more predictors after Near Zero Var has been applied!") + } + } + + if (is.null(multilevel) | (!is.null(multilevel) && ncol(multilevel) == 2)) + { + if(light.output == FALSE) + prediction.all = class.all = list() if(auc) { - auc.mean.sd[[comp]] = result$auc + auc.mean.sd=list() if(light.output == FALSE) - auc.all[[comp]] = result$auc.all + auc.all=list() } - } # end comp - - names(mat.error.rate) = c(paste0('comp', comp.real)) - if (all(measure!="AUC")) - names(error.per.class.keepX.opt) = c(paste0('comp', comp.real)) - - names(already.tested.X) = c(paste0('comp', 1:ncomp)) - - if (progressBar == TRUE) - cat('\n') - - # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) - if(nrepeat > 2 & length(comp.real) >1 & all(measure!="AUC")) - { - keepX = already.tested.X - error.keepX = NULL + class.object=c("mixo_splsda","DA") + error.per.class.keepX.opt = list() + + if(all(measure != "AUC")){ + error.per.class.keepX.opt.mean = matrix(0, nrow = nlevels(Y), ncol = length(comp.real), + dimnames = list(c(levels(Y)), c(paste0('comp', comp.real)))) + } else { + error.per.class.keepX.opt.mean=NULL + } + # successively tune the components until ncomp: comp1, then comp2, ... for(comp in 1:length(comp.real)) { - ind.row = match(keepX[[comp.real[comp]]],test.keepX) - error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + + if (progressBar == TRUE & is(BPPARAM, 'SerialParam')) + cat("\ncomp",comp.real[comp], "\n") + + result = MCVfold.spls(X, Y, multilevel = multilevel, validation = validation, folds = folds, nrepeat = nrepeat, ncomp = 1 + length(already.tested.X), + choice.keepX = already.tested.X, + test.keepX = test.keepX, test.keepY = nlevels(Y), measure = measure, dist = dist, scale=scale, + near.zero.var = near.zero.var, progressBar = progressBar, tol = tol, max.iter = max.iter, auc = auc, + BPPARAM = BPPARAM, + misdata = misdata, is.na.A = is.na.A, class.object=class.object) + + # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.spls' can work with multiple distances + mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] + mat.mean.error[, comp]=result[[measure]]$error.rate.mean[[1]] + if (!is.null(result[[measure]]$error.rate.sd[[1]])) + mat.sd.error[, comp]=result[[measure]]$error.rate.sd[[1]] + + # confusion matrix for keepX.opt + if (all(measure!="AUC")){ + error.per.class.keepX.opt[[comp]]=result[[measure]]$confusion[[1]] + error.per.class.keepX.opt.mean[, comp]=apply(result[[measure]]$confusion[[1]], 1, mean) + } + + # best keepX + already.tested.X = c(already.tested.X, result[[measure]]$keepX.opt[[1]]) + + if(light.output == FALSE) + { + #prediction of each samples for each fold and each repeat, on each comp + class.all[[comp]] = result$class.comp[[1]] + prediction.all[[comp]] = result$prediction.comp + } + + if(auc) + { + auc.mean.sd[[comp]] = result$auc + if(light.output == FALSE) + auc.all[[comp]] = result$auc.all + } + + } # end comp + + names(mat.error.rate) = c(paste0('comp', comp.real)) + if (all(measure!="AUC")) + names(error.per.class.keepX.opt) = c(paste0('comp', comp.real)) + + names(already.tested.X) = c(paste0('comp', 1:ncomp)) + + if (progressBar == TRUE) + cat('\n') + + # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) + if(nrepeat > 2 & length(comp.real) >1 & all(measure!="AUC")) + { + keepX = already.tested.X + error.keepX = NULL + for(comp in 1:length(comp.real)) + { + ind.row = match(keepX[[comp.real[comp]]],test.keepX) + error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + } + colnames(error.keepX) = c(paste0('comp', comp.real)) + + opt = t.test.process(error.keepX) + + ncomp_opt = comp.real[opt] + } else if(nrepeat > 2 & length(comp.real) >1 & all(measure=="AUC")){ + #hacking t.test.process for AUC + keepX = already.tested.X + error.keepX = NULL + for(comp in 1:length(comp.real)) + { + ind.row = match(keepX[[comp.real[comp]]],test.keepX) + error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + } + colnames(error.keepX) = c(paste0('comp', comp.real)) + + #1- to change the AUC to an 'error' and check for decrease + opt = t.test.process(1-error.keepX, alpha = signif.threshold) + + ncomp_opt = comp.real[opt] + } else { + ncomp_opt = error.keepX = NULL } - colnames(error.keepX) = c(paste0('comp', comp.real)) - opt = t.test.process(error.keepX) + result = list( + error.rate = mat.mean.error, + error.rate.sd = mat.sd.error, + error.rate.all = mat.error.rate, + choice.keepX = already.tested.X, + choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), + error.rate.class = error.per.class.keepX.opt.mean, + error.rate.class.all = error.per.class.keepX.opt) - ncomp_opt = comp.real[opt] - } else if(nrepeat > 2 & length(comp.real) >1 & all(measure=="AUC")){ - #hacking t.test.process for AUC - keepX = already.tested.X - error.keepX = NULL - for(comp in 1:length(comp.real)) + if(light.output == FALSE) { - ind.row = match(keepX[[comp.real[comp]]],test.keepX) - error.keepX = cbind(error.keepX, mat.error.rate[[comp]][ind.row,]) + names(class.all) = names(prediction.all) = c(paste0('comp', comp.real)) + result$predict = prediction.all + result$class = class.all } - colnames(error.keepX) = c(paste0('comp', comp.real)) + if(auc) + { + + #we add the AUC outputs only if it was not the measure + #otherwise there is twice the same output + if(all(measure != "AUC")){ + names(auc.mean.sd) = c(paste0('comp', comp.real)) + result$auc = auc.mean.sd + } + if(light.output == FALSE) + { + names(auc.all) = c(paste0('comp', comp.real)) + result$auc.all =auc.all + } + } + result$measure = measure + result$call = match.call() + result$call$nrepeat = nrepeat + result$nrepeat = nrepeat - #1- to change the AUC to an 'error' and check for decrease - opt = t.test.process(1-error.keepX, alpha = signif.threshold) + class(result) = "tune.splsda" - ncomp_opt = comp.real[opt] + return(result) } else { - ncomp_opt = error.keepX = NULL - } - - result = list( - error.rate = mat.mean.error, - error.rate.sd = mat.sd.error, - error.rate.all = mat.error.rate, - choice.keepX = already.tested.X, - choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), - error.rate.class = error.per.class.keepX.opt.mean, - error.rate.class.all = error.per.class.keepX.opt) - - if(light.output == FALSE) - { - names(class.all) = names(prediction.all) = c(paste0('comp', comp.real)) - result$predict = prediction.all - result$class = class.all - } - if(auc) - { + # if multilevel with 2 factors, we can not do as before because withinvariation depends on the factors, we maximase a correlation + message("For a two-factor analysis, the tuning criterion is based on the maximisation of the correlation between the components on the whole data set") - #we add the AUC outputs only if it was not the measure - #otherwise there is twice the same output - if(all(measure != "AUC")){ - names(auc.mean.sd) = c(paste0('comp', comp.real)) - result$auc = auc.mean.sd - } - if(light.output == FALSE) + cor.value = vector(length = length(test.keepX)) + names(cor.value) = test.keepX + + for (i in 1:length(test.keepX)) { - names(auc.all) = c(paste0('comp', comp.real)) - result$auc.all =auc.all + spls.train = splsda(X, Y, ncomp = ncomp, keepX = c(already.tested.X, test.keepX[i]), logratio = logratio, near.zero.var = FALSE) + + # Note: this is performed on the full data set + # (could be done with resampling (bootstrap) (option 1) and/or prediction (option 2)) + cor.value[i] = cor(spls.train$variates$X[, ncomp], spls.train$variates$Y[, ncomp]) + # } - } - result$measure = measure - result$call = match.call() - result$call$nrepeat = nrepeat - result$nrepeat = nrepeat - - class(result) = "tune.splsda" - - return(result) - } else { - # if multilevel with 2 factors, we can not do as before because withinvariation depends on the factors, we maximase a correlation - message("For a two-factor analysis, the tuning criterion is based on the maximisation of the correlation between the components on the whole data set") - - cor.value = vector(length = length(test.keepX)) - names(cor.value) = test.keepX - - for (i in 1:length(test.keepX)) - { - spls.train = splsda(X, Y, ncomp = ncomp, keepX = c(already.tested.X, test.keepX[i]), logratio = logratio, near.zero.var = FALSE) + return(list(cor.value = cor.value)) - # Note: this is performed on the full data set - # (could be done with resampling (bootstrap) (option 1) and/or prediction (option 2)) - cor.value[i] = cor(spls.train$variates$X[, ncomp], spls.train$variates$Y[, ncomp]) - # } - return(list(cor.value = cor.value)) - - } + } # end else statement (i.e. test.keepX is not NULL) } diff --git a/examples/tune.splsda-examples.R b/examples/tune.splsda-examples.R new file mode 100644 index 00000000..801e3681 --- /dev/null +++ b/examples/tune.splsda-examples.R @@ -0,0 +1,46 @@ +## First example: analysis with sPLS-DA +data(breast.tumors) +X = breast.tumors$gene.exp +Y = as.factor(breast.tumors$sample$treatment) + +# first tune on components only +tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = NULL, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +# then tune optimal keepX for each component +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), dist = "centroids.dist", + progressBar = TRUE, + seed = 20) + +plot(tune) +tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) + +\dontrun{ +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), + dist = "centroids.dist", + progressBar = TRUE, + seed = 20) +plot(tune) + +## Second example: multilevel one-factor analysis with sPLS-DA + +data(vac18) +X = vac18$genes +Y = vac18$stimulation +# sample indicates the repeated measurements +design = data.frame(sample = vac18$sample) + +tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", + test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) +} \ No newline at end of file diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index a322ebb7..812d49ca 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -8,11 +8,11 @@ tune.splsda( X, Y, ncomp = 1, - test.keepX = c(5, 10, 15), + test.keepX = NULL, already.tested.X, validation = "Mfold", folds = 10, - dist = "max.dist", + dist = NULL, measure = "BER", scale = TRUE, auc = FALSE, @@ -38,7 +38,8 @@ responses (for multi-response models) \code{NA}s are allowed.} \item{ncomp}{the number of components to include in the model.} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set. If set to NULL, tuning will be performed on ncomp +using all variables in the \eqn{X} data set.} \item{already.tested.X}{Optional, if \code{ncomp > 1} A numeric vector indicating the number of variables to select from the \eqn{X} data set on @@ -120,8 +121,7 @@ between latent variables.} } \description{ Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input -grid to determine optimal values for the sparsity parameters in -\code{splsda}. +grid to determine optimal values for the parameters in \code{splsda}. } \details{ This tuning function should be used to tune the parameters in the @@ -183,32 +183,46 @@ towards majority classes during the performance assessment. More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). + +If test.keepX is set to NULL, the \code{perf()} function will be run internally, +which performs cross-validation to identify optimal number of components and +distance measure. Running tuning initially using \code{test.keepX = NULL} speeds +up the parameter tuning workflow, as then a lower ncomp value can be used for +variable selection tuning. } \examples{ ## First example: analysis with sPLS-DA - data(breast.tumors) X = breast.tumors$gene.exp Y = as.factor(breast.tumors$sample$treatment) -tune = tune.splsda(X, Y, ncomp = 1, nrepeat = 10, logratio = "none", -test.keepX = c(5, 10, 15), folds = 10, dist = "max.dist", -progressBar = TRUE) -\dontrun{ -# 5 components, optimising 'keepX' and 'ncomp' -tune = tune.splsda(X, Y, ncomp = 5, test.keepX = c(5, 10, 15), -folds = 10, dist = "max.dist", nrepeat = 5, progressBar = FALSE) +# first tune on components only +tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = NULL, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 + +# then tune optimal keepX for each component +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), dist = "centroids.dist", + progressBar = TRUE, + seed = 20) -tune$choice.ncomp -tune$choice.keepX plot(tune) +tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) -## only tune component 3 and 4 -# keeping 5 and 10 variables on the first two components respectively - -tune = tune.splsda(X = X,Y = Y, ncomp = 4, -already.tested.X = c(5,10), -test.keepX = seq(1,10,2), progressBar = TRUE) +\dontrun{ +tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", + nrepeat = 10, folds = 10, + test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), + dist = "centroids.dist", + progressBar = TRUE, + seed = 20) +plot(tune) ## Second example: multilevel one-factor analysis with sPLS-DA @@ -219,8 +233,9 @@ Y = vac18$stimulation design = data.frame(sample = vac18$sample) tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", -test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) + test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) +plot(tune) } } \references{ From eee25cc98a34f353023dc43b4a2163a541d12522 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:05:15 +1100 Subject: [PATCH 02/75] add unit testing for edited tune.splsda functionality --- tests/testthat/test-tune.splsda.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 7ad95d23..3411df54 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -46,3 +46,27 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap .expect_numerically_close(tune.splsda.res.4$error.rate[1,1], 0.3111111) }) + +test_that("tune.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + # set up data + data(breast.tumors) + X <- breast.tumors$gene.exp + Y <- as.factor(breast.tumors$sample$treatment) + + # tune on components only + tune_res <- suppressWarnings( + tune.splsda(X, Y, ncomp = 2, logratio = "none", + nrepeat = 1, folds = 3, + test.keepX = NULL, seed = 20) + ) + + # run perf + splsda_res <- splsda(X, Y, ncomp = 2) + perf_res <- suppressWarnings( + perf(splsda_res, ncomp = 2, nrepeat = 1, folds = 3, seed = 20) + ) + + # check results match + expect_equal(tune_res$error.rate$overall[1,2], perf_res$error.rate$overall[1,2]) +}) From e9ba70a938989211f69122187f5ff8967911b2a6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:16:06 +1100 Subject: [PATCH 03/75] update tune.spca docs as these depended on tune.splsda docs --- R/tune.spca.R | 4 +++- man/tune.spca.Rd | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/tune.spca.R b/R/tune.spca.R index 70802e1e..7e90873b 100644 --- a/R/tune.spca.R +++ b/R/tune.spca.R @@ -16,8 +16,10 @@ #' it is recommended to use Leave-One-Out Cross-Validation which can be #' achieved by setting \code{folds = nrow(X)}. #' @inheritParams spca -#' @inheritParams tune.splsda #' @param folds Number of folds in 'Mfold' cross-validation. See details. +#' @param nrepeat Number of times the Cross-Validation process is repeated. +#' @param test.keepX numeric vector for the different number of variables to +#' test from the \eqn{X} data set. #' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating the type #' of parallelisation. See examples. #' @param seed set a number here if you want the function to give reproducible outputs. diff --git a/man/tune.spca.Rd b/man/tune.spca.Rd index d6a78ee3..1c3340fd 100644 --- a/man/tune.spca.Rd +++ b/man/tune.spca.Rd @@ -32,7 +32,7 @@ ncol(X))}} \item{folds}{Number of folds in 'Mfold' cross-validation. See details.} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set.} \item{center}{(Default=TRUE) Logical, whether the variables should be shifted to be zero centered. Only set to FALSE if data have already been centered. From b56a9e6e8c4869e5faf0bf38d9bf87a32b299226 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:52:01 +1100 Subject: [PATCH 04/75] update tune.splsda docs --- R/tune.splsda.R | 11 ++++++++++- man/tune.splsda.Rd | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 00af63e6..3aec4555 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -128,6 +128,14 @@ #' \item{error.rate.class}{returns the error rate for each level of \code{Y} #' and for each component computed with the optimal keepX} #' +#' If test.keepX = FALSE,produces a matrix of classification +#' error rate estimation. The dimensions correspond to the components in the +#' model and to the prediction method used, respectively. Note that error rates +#' reported in any component include the performance of the model in earlier +#' components for the specified \code{keepX} parameters (e.g. error rate +#' reported for component 3 for \code{keepX = 20} already includes the fitted +#' model on components 1 and 2 for \code{keepX = 20}). +#' #' \item{predict}{Prediction values for each sample, each \code{test.keepX}, #' each comp and each repeat. Only if light.output=FALSE} #' \item{class}{Predicted class for each sample, each \code{test.keepX}, each @@ -285,7 +293,8 @@ tune.splsda <- #-> if test.keepX set to NULL run perf() i.e. parameter tuning of just ncomp if (is.null(test.keepX)){ print("test.keepX set to NULL, tuning only for number of components...") - splsda_res <- splsda(X, Y, ncomp) + splsda_res <- splsda(X, Y, ncomp, scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, multilevel = multilevel) perf_res <- perf(splsda_res, validation = validation, folds = folds, nrepeat = nrepeat, dist = dist, signif.threshold = signif.threshold, diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index 812d49ca..465cbfa2 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -108,6 +108,14 @@ components for the model fitted with \code{$choice.keepX} } \item{error.rate.class}{returns the error rate for each level of \code{Y} and for each component computed with the optimal keepX} +If test.keepX = FALSE,produces a matrix of classification +error rate estimation. The dimensions correspond to the components in the +model and to the prediction method used, respectively. Note that error rates +reported in any component include the performance of the model in earlier +components for the specified \code{keepX} parameters (e.g. error rate +reported for component 3 for \code{keepX = 20} already includes the fitted +model on components 1 and 2 for \code{keepX = 20}). + \item{predict}{Prediction values for each sample, each \code{test.keepX}, each comp and each repeat. Only if light.output=FALSE} \item{class}{Predicted class for each sample, each \code{test.keepX}, each From 07ad57d431650daed872e3dfa71c7154407e1736 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:52:09 +1100 Subject: [PATCH 05/75] init tune.plsda which calls perf --- NAMESPACE | 1 + R/tune.plsda.R | 253 ++++++++++++++++++++++++++++++++++++++++++++++ man/tune.plsda.Rd | 184 +++++++++++++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 R/tune.plsda.R create mode 100644 man/tune.plsda.Rd diff --git a/NAMESPACE b/NAMESPACE index 59c51e6d..ae073e9f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -172,6 +172,7 @@ export(tune) export(tune.block.splsda) export(tune.mint.splsda) export(tune.pca) +export(tune.plsda) export(tune.rcc) export(tune.spca) export(tune.spls) diff --git a/R/tune.plsda.R b/R/tune.plsda.R new file mode 100644 index 00000000..801c1235 --- /dev/null +++ b/R/tune.plsda.R @@ -0,0 +1,253 @@ +# ======================================================================================================== +# tune.plsda: tuning hyperparameters on a plsda method +# ======================================================================================================== + +#' Tuning functions for PLS-DA method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{plsda}. +#' +#' +#' This tuning function should be used to tune the parameters in the +#' \code{plsda} function (number of components and distance metric to select). +#' +#' For a PLS-DA, M-fold or LOO cross-validation is performed with stratified +#' subsampling where all classes are represented in each fold. +#' +#' If \code{validation = "loo"}, leave-one-out cross-validation is performed. +#' By default \code{folds} is set to the number of unique individuals. +#' +#' The function outputs the optimal number of components that achieve the best +#' performance based on the overall error rate or BER. The assessment is +#' data-driven and similar to the process detailed in (Rohart et al., 2016), +#' where one-sided t-tests assess whether there is a gain in performance when +#' adding a component to the model. Our experience has shown that in most case, +#' the optimal number of components is the number of categories in \code{Y} - +#' 1, but it is worth tuning a few extra components to check (see our website +#' and case studies for more details). +#' +#' For PLS-DA multilevel one-factor analysis, M-fold or LOO cross-validation +#' is performed where all repeated measurements of one sample are in the same +#' fold. Note that logratio transform and the multilevel analysis are performed +#' internally and independently on the training and test set. +#' +#' For a PLS-DA multilevel two-factor analysis, the correlation between +#' components from the within-subject variation of X and the \code{cond} matrix +#' is computed on the whole data set. The reason why we cannot obtain a +#' cross-validation error rate as for the pls-DA one-factor analysis is +#' because of the difficulty to decompose and predict the within matrices +#' within each fold. +#' +#' For a PLS two-factor analysis a PLS canonical mode is run, and the +#' correlation between components from the within-subject variation of X and Y +#' is computed on the whole data set. +#' +#' If \code{validation = "Mfold"}, M-fold cross-validation is performed. How +#' many folds to generate is selected by specifying the number of folds in +#' \code{folds}. +#' +#' If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the +#' Area Under the Curve is averaged using one-vs-all comparison. Note however +#' that the AUC criteria may not be particularly insightful as the prediction +#' threshold we use in sPLS-DA differs from an AUC threshold (PLS-DA relies on +#' prediction distances for predictions, see \code{?predict.plsda} for more +#' details) and the supplemental material of the mixOmics article (Rohart et +#' al. 2017). If you want the AUC criterion to be insightful, you should use +#' \code{measure==AUC} as this will output the number of variable that +#' maximises the AUC; in this case there is no prediction threshold from +#' PLS-DA (\code{dist} is not used). If \code{measure==AUC}, we do not output +#' SD as this measure can be a mean (over \code{nrepeat}) of means (over the +#' categories). +#' +#' BER is appropriate in case of an unbalanced number of samples per class as +#' it calculates the average proportion of wrongly classified samples in each +#' class, weighted by the number of samples in each class. BER is less biased +#' towards majority classes during the performance assessment. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). +#' +#' The tune.plsda() function calls older function perf() to perform this cross-validation, +#' for more details see the perf() help pages. +#' +#' @param X numeric matrix of predictors. \code{NA}s are allowed. +#' @param Y \code{if(method = 'spls')} numeric vector or matrix of continuous +#' responses (for multi-response models) \code{NA}s are allowed. +#' @param ncomp the number of components to include in the model. +#' @param validation character. What kind of (internal) validation to use, +#' matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +#' Default is \code{"Mfold"}. +#' @param folds the folds in the Mfold cross-validation. See Details. +#' @param dist distance metric to use for \code{splsda} to estimate the +#' classification error rate, should be a subset of \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +#' @param measure Three misclassification measure are available: overall +#' misclassification error \code{overall}, the Balanced Error Rate \code{BER} +#' or the Area Under the Curve \code{AUC} +#' @param scale Logical. If scale = TRUE, each block is standardized to zero +#' means and unit variances (default: TRUE) +#' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) +#' performance of the model based on the optimisation measure \code{measure}. +#' @param progressBar by default set to \code{TRUE} to output the progress bar +#' of the computation. +#' @param tol Convergence stopping value. +#' @param max.iter integer, the maximum number of iterations. +#' @param near.zero.var Logical, see the internal \code{\link{nearZeroVar}} +#' function (should be set to TRUE in particular for data with many zero +#' values). Default value is FALSE +#' @param nrepeat Number of times the Cross-Validation process is repeated. +#' @param logratio one of ('none','CLR'). Default to 'none' +#' @param multilevel Design matrix for multilevel analysis (for repeated +#' measurements) that indicates the repeated measures on each individual, i.e. +#' the individuals ID. See Details. +#' @param light.output if set to FALSE, the prediction/classification of each +#' sample for each of \code{test.keepX} and each comp is returned. +#' @param signif.threshold numeric between 0 and 1 indicating the significance +#' threshold required for improvement in error rate of the components. Default +#' to 0.01. +#' @template arg/BPPARAM +#' @param seed set a number here if you want the function to give reproducible outputs. +#' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. +#' @return matrix of classification error rate estimation. +#' The dimensions correspond to the components in the +#' model and to the prediction method used, respectively. +#' +#' \item{auc}{Averaged AUC values +#' over the \code{nrepeat}} +#' +#' \item{cor.value}{only if multilevel analysis with 2 factors: correlation +#' between latent variables.} +#' +#' @author Kim-Anh Lê Cao, Benoit Gautier, Francois Bartolo, Florian Rohart, +#' Al J Abadi +#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and +#' http://www.mixOmics.org for more details. +#' @references mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.plsda-examples.R + +tune.plsda <- + function (X, Y, + ncomp = 1, + validation = "Mfold", + folds = 10, + dist = NULL, + measure = "BER", # one of c("overall","BER") + scale = TRUE, + auc = FALSE, + progressBar = FALSE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + nrepeat = 1, + logratio = c('none','CLR'), + multilevel = NULL, + light.output = TRUE, + signif.threshold = 0.01, + BPPARAM = SerialParam(), + seed = NULL + ) + { #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + + BPPARAM$RNGseed <- seed + set.seed(seed) + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #------------------# + #-- check entries --# + if(!is(X, "matrix")) + X = as.matrix(X) + + if (length(dim(X)) != 2 || !is.numeric(X)) + stop("'X' must be a numeric matrix.") + + + # Testing the input Y + if (is.null(multilevel)) + { + if (is.null(Y)) + stop("'Y' has to be something else than NULL.") + + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level") + + } else { + # we expect a vector or a 2-columns matrix in 'Y' and the repeated measurements in 'multilevel' + multilevel = data.frame(multilevel) + + if ((nrow(X) != nrow(multilevel))) + stop("unequal number of rows in 'X' and 'multilevel'.") + + if (ncol(multilevel) != 1) + stop("'multilevel' should have a single column for the repeated measurements, other factors should be included in 'Y'.") + + if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0,1,2))# multilevel 1 or 2 factors + stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-columns data.frame containing 2 factors.") + + multilevel = data.frame(multilevel, Y) + multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements + + } + + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", call. = FALSE) + + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + + #-- validation + choices = c("Mfold", "loo") + validation = choices[pmatch(validation, choices)] + if (is.na(validation)) + stop("'validation' must be either 'Mfold' or 'loo'") + + if (validation == 'loo') + { + if (nrepeat != 1) + warning("Leave-One-Out validation does not need to be repeated: 'nrepeat' is set to '1'.") + nrepeat = 1 + } + + if (nrepeat < 3 & ncomp > 1) + message("Note that the number of components cannot be reliably tuned with nrepeat < 3 or validaion = 'loo'.") + + #-- logratio + logratio <- match.arg(logratio) + + #-- measure + choices = c("BER", "overall","AUC") + measure = choices[pmatch(measure, choices)] + if (is.na(measure)) + stop("'measure' must be either 'BER', 'overall' or 'AUC' ") + + if (any(is.na(validation)) || length(validation) > 1) + stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) + + plsda_res <- plsda(X, Y, ncomp, scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, multilevel = multilevel) + perf_res <- perf(plsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, signif.threshold = signif.threshold, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar, + auc = auc) + return(perf_res) + } \ No newline at end of file diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd new file mode 100644 index 00000000..ee63d7ce --- /dev/null +++ b/man/tune.plsda.Rd @@ -0,0 +1,184 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.plsda.R +\name{tune.plsda} +\alias{tune.plsda} +\title{Tuning functions for PLS-DA method} +\usage{ +tune.plsda( + X, + Y, + ncomp = 1, + validation = "Mfold", + folds = 10, + dist = NULL, + measure = "BER", + scale = TRUE, + auc = FALSE, + progressBar = FALSE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + nrepeat = 1, + logratio = c("none", "CLR"), + multilevel = NULL, + light.output = TRUE, + signif.threshold = 0.01, + BPPARAM = SerialParam(), + seed = NULL +) +} +\arguments{ +\item{X}{numeric matrix of predictors. \code{NA}s are allowed.} + +\item{Y}{\code{if(method = 'spls')} numeric vector or matrix of continuous +responses (for multi-response models) \code{NA}s are allowed.} + +\item{ncomp}{the number of components to include in the model.} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +Default is \code{"Mfold"}.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{dist}{distance metric to use for \code{splsda} to estimate the +classification error rate, should be a subset of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} + +\item{measure}{Three misclassification measure are available: overall +misclassification error \code{overall}, the Balanced Error Rate \code{BER} +or the Area Under the Curve \code{AUC}} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model based on the optimisation measure \code{measure}.} + +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{logratio}{one of ('none','CLR'). Default to 'none'} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements) that indicates the repeated measures on each individual, i.e. +the individuals ID. See Details.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + +\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type +of parallelisation. See examples in \code{?tune.spca}.} + +\item{seed}{set a number here if you want the function to give reproducible outputs. +Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} +} +\value{ +matrix of classification error rate estimation. +The dimensions correspond to the components in the +model and to the prediction method used, respectively. + +\item{auc}{Averaged AUC values +over the \code{nrepeat}} + +\item{cor.value}{only if multilevel analysis with 2 factors: correlation +between latent variables.} +} +\description{ +Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +grid to determine optimal values for the parameters in \code{plsda}. +} +\details{ +This tuning function should be used to tune the parameters in the +\code{plsda} function (number of components and distance metric to select). + +For a PLS-DA, M-fold or LOO cross-validation is performed with stratified +subsampling where all classes are represented in each fold. + +If \code{validation = "loo"}, leave-one-out cross-validation is performed. +By default \code{folds} is set to the number of unique individuals. + +The function outputs the optimal number of components that achieve the best +performance based on the overall error rate or BER. The assessment is +data-driven and similar to the process detailed in (Rohart et al., 2016), +where one-sided t-tests assess whether there is a gain in performance when +adding a component to the model. Our experience has shown that in most case, +the optimal number of components is the number of categories in \code{Y} - +1, but it is worth tuning a few extra components to check (see our website +and case studies for more details). + +For PLS-DA multilevel one-factor analysis, M-fold or LOO cross-validation +is performed where all repeated measurements of one sample are in the same +fold. Note that logratio transform and the multilevel analysis are performed +internally and independently on the training and test set. + +For a PLS-DA multilevel two-factor analysis, the correlation between +components from the within-subject variation of X and the \code{cond} matrix +is computed on the whole data set. The reason why we cannot obtain a +cross-validation error rate as for the pls-DA one-factor analysis is +because of the difficulty to decompose and predict the within matrices +within each fold. + +For a PLS two-factor analysis a PLS canonical mode is run, and the +correlation between components from the within-subject variation of X and Y +is computed on the whole data set. + +If \code{validation = "Mfold"}, M-fold cross-validation is performed. How +many folds to generate is selected by specifying the number of folds in +\code{folds}. + +If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the +Area Under the Curve is averaged using one-vs-all comparison. Note however +that the AUC criteria may not be particularly insightful as the prediction +threshold we use in sPLS-DA differs from an AUC threshold (PLS-DA relies on +prediction distances for predictions, see \code{?predict.plsda} for more +details) and the supplemental material of the mixOmics article (Rohart et +al. 2017). If you want the AUC criterion to be insightful, you should use +\code{measure==AUC} as this will output the number of variable that +maximises the AUC; in this case there is no prediction threshold from +PLS-DA (\code{dist} is not used). If \code{measure==AUC}, we do not output +SD as this measure can be a mean (over \code{nrepeat}) of means (over the +categories). + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). + +The tune.plsda() function calls older function perf() to perform this cross-validation, +for more details see the perf() help pages. +} +\references{ +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 +} +\seealso{ +\code{\link{splsda}}, \code{\link{predict.splsda}} and +http://www.mixOmics.org for more details. +} +\author{ +Kim-Anh Lê Cao, Benoit Gautier, Francois Bartolo, Florian Rohart, +Al J Abadi +} +\keyword{multivariate} +\keyword{regression} From 0670a6695ad059ea9bfcdcf1c429c1c0ec093f95 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:53:41 +1100 Subject: [PATCH 06/75] add tune.plsda examples --- examples/tune.plsda-examples.R | 12 ++++++++++++ man/tune.plsda.Rd | 14 ++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 examples/tune.plsda-examples.R diff --git a/examples/tune.plsda-examples.R b/examples/tune.plsda-examples.R new file mode 100644 index 00000000..91cb039f --- /dev/null +++ b/examples/tune.plsda-examples.R @@ -0,0 +1,12 @@ +## Example: analysis with PLS-DA +data(breast.tumors) +X = breast.tumors$gene.exp +Y = as.factor(breast.tumors$sample$treatment) + +# tune components and distance +tune = tune.plsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 \ No newline at end of file diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd index ee63d7ce..8b29a26d 100644 --- a/man/tune.plsda.Rd +++ b/man/tune.plsda.Rd @@ -165,6 +165,20 @@ supplemental material of the mixOmics article (Rohart et al. 2017). The tune.plsda() function calls older function perf() to perform this cross-validation, for more details see the perf() help pages. } +\examples{ +## Example: analysis with PLS-DA +data(breast.tumors) +X = breast.tumors$gene.exp +Y = as.factor(breast.tumors$sample$treatment) + +# tune components and distance +tune = tune.plsda(X, Y, ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only +plot(tune) # optimal distance = centroids.dist +tune$choice.ncomp # optimal component number = 3 +} \references{ mixOmics article: From 47ae900106243a8356f5b8f1dcaebc6a2bd2df43 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 12:50:26 +1100 Subject: [PATCH 07/75] update tune.spls to run perf if test.keepX and test.keepY are NULL, update docs to reflect --- R/tune.spls.R | 100 +++++++++++++++++++++++++++++++++++++++++------ man/tune.spls.Rd | 71 +++++++++++++++++++++++++++++---- 2 files changed, 152 insertions(+), 19 deletions(-) diff --git a/R/tune.spls.R b/R/tune.spls.R index 6d1050c5..e7bd8b02 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -27,14 +27,18 @@ # ======================================================================================================== -# tune.spls: chose the optimal number of parameters per component on a spls method +# tune.spls: Tuning hyperparameters on a spls method # ======================================================================================================== -# TODO details on nrepeat -# TODO details on pls vs spls -# TODO tidy outputs preferably in an array -#' Tuning functions for sPLS and PLS functions #' -#' @template description/tune +#' Tuning functions for sPLS method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{spls}. +#' +#' +#' This tuning function should be used to tune the parameters in the +#' \code{spls} function (number of components and number of variables to select). +#' #' #' @template section/folds #' @template section/nrepeat @@ -49,19 +53,27 @@ #' @template arg/ncomp #' @template arg/test.keepX-X.matrix #' @template arg/test.keepY -#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} -#' @template arg/mode #' @template arg/validation #' @template arg/folds #' @template arg/nrepeat #' @param measure The tuning measure to use. See details. +#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} +#' @template arg/mode +#' @param arg/scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE +#' @param arg/tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. +#' @param arg/max.iter Integer, the maximum number of iterations. Default to 100. +#' @param arg/near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param arg/logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. +#' @param arg/multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. #' @template arg/progressBar #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. #' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}. #' @param ... Optional parameters passed to \code{\link{spls}} -#' @return A list that contains: \item{cor.pred}{The correlation of predicted vs +#' @return +#' If \code{test.keepX != NULL} and \code{test.keepY != NULL} returns a list that contains: +#' \item{cor.pred}{The correlation of predicted vs #' actual components from X (t) and Y (u) for each #' component}\item{RSS.pred}{The Residual Sum of Squares of predicted vs #' actual components from X (t) and Y (u) for each component} @@ -71,6 +83,41 @@ #' \item{choice.ncomp}{returns the optimal number of components for the model #' fitted with \code{$choice.keepX} and \code{$choice.keepY} } \item{call}{The #' functioncal call including the parameteres used.} +#' +#' If \code{test.keepX = NULL} and \code{test.keepY = NULL} returns a list with the following components for every repeat: +#' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +#' inherited from \code{"pls"}, and \code{"spls"}. Only available when in +#' regression (s)PLS.} +#' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +#' else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +#' Note that in the specific case of an sPLS model, it is better to have a look +#' at the Q2.total criterion, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +#' \item{RSS}{Residual Sum of Squares across all selected features and the +#' components.} +#' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected +#' features and the components.} +#' \item{features}{a list of features selected across the +#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} +#' if using standard (non-sparse) PLS.} +#' \item{cor.tpred, cor.upred}{Correlation between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{error.rate} +#' +#' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, #' Florian Rohart, #' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and @@ -130,17 +177,29 @@ tune.spls <- test.keepX = NULL, test.keepY = NULL, ncomp, + # params related to CV validation = c('Mfold', 'loo'), nrepeat = 1, folds, - mode = c('regression', 'canonical', 'classic'), measure = NULL, + # params related to spls model building + mode = c('regression', 'canonical', 'classic'), + scale = TRUE, + logratio = "none", + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, + # params related to running tune BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, limQ2 = 0.0975, ... ) { + + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# out = list() mode <- match.arg(mode) @@ -156,6 +215,20 @@ tune.spls <- nrepeat <- check_cv$nrepeat folds <- check_cv$folds + #-- test.keepX + #-> if test.keepX set to NULL run perf() i.e. parameter tuning of just ncomp using all features + if (is.null(test.keepX) && is.null(test.keepY)){ + print("test.keepX and test.keepY are set to NULL, tuning only for number of components...") + spls_res <- spls(X, Y, ncomp, keepX = ncol(X), keepY = ncol(Y), + mode = mode, scale = scale, logratio = logratio, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, multilevel = multilevel) + perf_res <- perf(spls_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } else { + test.keepX <- .change_if_null(arg = test.keepX, default = ncol(X)) test.keepY <- .change_if_null(arg = test.keepY, default = ncol(Y)) @@ -244,7 +317,11 @@ tune.spls <- pls.model <- spls(X = X, Y = Y, keepX = c(choice.keepX, keepX), keepY = c(choice.keepY, keepY), - ncomp = comp, mode = mode, ...) + ncomp = comp, mode = mode, + scale = scale, tol = tol, max.iter = max.iter, + near.zero.var = near.zero.var, logratio = logratio, + multilevel = multilevel, + ...) # run perf in serial to avoid nested parallel processes, but make sure seed is passed into perf pls.perf <- perf(pls.model, validation = validation, folds = folds, nrepeat = nrepeat, @@ -456,4 +533,5 @@ tune.spls <- class <- c('tune.pls') class(out) <- if (spls.model) class <- c(class, 'tune.spls') return(out) + } } diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index d9250d14..1628da4f 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tune.spls.R \name{tune.spls} \alias{tune.spls} -\title{Tuning functions for sPLS and PLS functions} +\title{Tuning functions for sPLS method} \usage{ tune.spls( X, @@ -13,8 +13,14 @@ tune.spls( validation = c("Mfold", "loo"), nrepeat = 1, folds, - mode = c("regression", "canonical", "classic"), measure = NULL, + mode = c("regression", "canonical", "classic"), + scale = TRUE, + logratio = "none", + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, @@ -46,11 +52,11 @@ details.} \item{folds}{Positive Integer, The folds in the Mfold cross-validation.} +\item{measure}{The tuning measure to use. See details.} + \item{mode}{Character string indicating the type of PLS algorithm to use. One of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} -\item{measure}{The tuning measure to use. See details.} - \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} @@ -63,9 +69,22 @@ computation completes. Default to \code{FALSE}.} \item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} \item{...}{Optional parameters passed to \code{\link{spls}}} + +\item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} + +\item{arg/tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} + +\item{arg/max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{arg/near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} + +\item{arg/logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} + +\item{arg/multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} } \value{ -A list that contains: \item{cor.pred}{The correlation of predicted vs +If \code{test.keepX != NULL} and \code{test.keepY != NULL} returns a list that contains: +\item{cor.pred}{The correlation of predicted vs actual components from X (t) and Y (u) for each component}\item{RSS.pred}{The Residual Sum of Squares of predicted vs actual components from X (t) and Y (u) for each component} @@ -75,11 +94,47 @@ A list that contains: \item{cor.pred}{The correlation of predicted vs \item{choice.ncomp}{returns the optimal number of components for the model fitted with \code{$choice.keepX} and \code{$choice.keepY} } \item{call}{The functioncal call including the parameteres used.} + +If \code{test.keepX = NULL} and \code{test.keepY = NULL} returns a list with the following components for every repeat: +\item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +inherited from \code{"pls"}, and \code{"spls"}. Only available when in +regression (s)PLS.} +\item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +Note that in the specific case of an sPLS model, it is better to have a look +at the Q2.total criterion, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +\ldots ,}\code{ncomp} components, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +\item{RSS}{Residual Sum of Squares across all selected features and the +components.} +\item{PRESS}{Predicted Residual Error Sum of Squares across all selected +features and the components.} +\item{features}{a list of features selected across the +folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +\code{keepY} parameters from the input object. Note, this will be \code{NULL} +if using standard (non-sparse) PLS.} +\item{cor.tpred, cor.upred}{Correlation between the +predicted and actual components for X (t) and Y (u)} +\item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +predicted and actual components for X (t) and Y (u)} +\item{error.rate} } \description{ -This function uses repeated cross-validation to tune hyperparameters such as -the number of features to select and possibly the number of components to -extract. +Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +grid to determine optimal values for the parameters in \code{spls}. +} +\details{ +This tuning function should be used to tune the parameters in the +\code{spls} function (number of components and number of variables to select). } \section{folds}{ From da13ece50c1b418274903526f37a610a6a3c153a Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:07:28 +1100 Subject: [PATCH 08/75] add unit test for new tune.spls functionality --- tests/testthat/test-tune.spls.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/testthat/test-tune.spls.R b/tests/testthat/test-tune.spls.R index c990e9c0..8e54b23e 100644 --- a/tests/testthat/test-tune.spls.R +++ b/tests/testthat/test-tune.spls.R @@ -114,3 +114,27 @@ test_that("tune.spls.1 works and is the same in parallel and when run in tune wr expect_equal(tune.spls1.MAE.1$measure.pred$mean, tune.spls1.MAE.3$measure.pred$mean) expect_equal(tune.spls1.MAE.1$measure.pred$mean, tune.spls1.MAE.4$measure.pred$mean) }) + +test_that("tune.spls works when test.keepX and test.keepY = NULL and gives same result as perf()", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + + # tune on components only + tune_res <- suppressWarnings( + tune.spls(X, Y, ncomp = 2, logratio = "none", + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + + # run perf + spls_res <- spls(X, Y, ncomp = 2, logratio = "none") + perf_res <- suppressWarnings( + perf(spls_res, ncomp = 2, nrepeat = 1, folds = 3, seed = 20) + ) + + # check results match + expect_equal(tune_res$measures$MSEP$summary[1,3], perf_res$measures$MSEP$summary[1,3]) +}) From fb825b08e627161ba5e01ab827fb17a3fad91cf1 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:13:23 +1100 Subject: [PATCH 09/75] add few more tests for different params to check tune.spls works --- tests/testthat/test-tune.spls.R | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/testthat/test-tune.spls.R b/tests/testthat/test-tune.spls.R index 8e54b23e..a2773e01 100644 --- a/tests/testthat/test-tune.spls.R +++ b/tests/testthat/test-tune.spls.R @@ -138,3 +138,38 @@ test_that("tune.spls works when test.keepX and test.keepY = NULL and gives same # check results match expect_equal(tune_res$measures$MSEP$summary[1,3], perf_res$measures$MSEP$summary[1,3]) }) + +test_that("tune.spls works in different test cases", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + list.keepX <- c(5:10, seq(15, 50, 5)) + + # near.zero.var = TRUE + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", + test.keepX = list.keepX, test.keepY = NULL, seed = 20) + ) + + # canonical mode + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", mode = "canonical", + test.keepX = NULL, test.keepY = NULL, seed = 20) + ) + expect_no_error( + tune.spls(X, Y, ncomp = 2, logratio = "none", near.zero.var = TRUE, + nrepeat = 1, folds = 3, measure = "cor", mode = "canonical", + test.keepX = list.keepX, test.keepY = NULL, seed = 20) + ) +}) + + From c8f2d8c547b39163f294760a242a072c0533cca0 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:19:26 +1100 Subject: [PATCH 10/75] add tune.spls examples to docs --- R/tune.spls.R | 17 +---------------- examples/tune.spls-examples.R | 18 ++++++++++++++++++ man/tune.spls.Rd | 28 +++++++++++++++------------- 3 files changed, 34 insertions(+), 29 deletions(-) create mode 100644 examples/tune.spls-examples.R diff --git a/R/tune.spls.R b/R/tune.spls.R index e7bd8b02..033a6a68 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -153,23 +153,8 @@ #' 4:e1845. #' @keywords regression multivariate #' @export -#' @examples +#' @example ./examples/tune.spls-examples.R #' -#' \dontrun{ -#' data(liver.toxicity) -#' X <- liver.toxicity$gene -#' Y <- liver.toxicity$clinic -#' set.seed(42) -#' tune.res = tune.spls( X, Y, ncomp = 3, -#' test.keepX = c(5, 10, 15), -#' test.keepY = c(3, 6, 8), measure = "cor", -#' folds = 5, nrepeat = 3, progressBar = TRUE) -#' tune.res$choice.ncomp -#' tune.res$choice.keepX -#' tune.res$choice.keepY -#' # plot the results -#' plot(tune.res) -#' } # TODO change this so that it simply wraps perf tune.spls <- function(X, diff --git a/examples/tune.spls-examples.R b/examples/tune.spls-examples.R new file mode 100644 index 00000000..ddee2700 --- /dev/null +++ b/examples/tune.spls-examples.R @@ -0,0 +1,18 @@ +# set up data +data(liver.toxicity) +X <- liver.toxicity$gene +Y <- liver.toxicity$clinic + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs \ No newline at end of file diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index 1628da4f..73bd4439 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -199,22 +199,24 @@ See also \code{?perf} for more details. } \examples{ - -\dontrun{ +# set up data data(liver.toxicity) X <- liver.toxicity$gene Y <- liver.toxicity$clinic -set.seed(42) -tune.res = tune.spls( X, Y, ncomp = 3, - test.keepX = c(5, 10, 15), - test.keepY = c(3, 6, 8), measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) -tune.res$choice.ncomp -tune.res$choice.keepX -tune.res$choice.keepY -# plot the results -plot(tune.res) -} + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs } \references{ mixOmics article: From 413de191ac18f3f44dbeb8f5fb5ac7e8e6413870 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:32:40 +1100 Subject: [PATCH 11/75] update tune.spls examples to include spls1 model --- examples/tune.spls-examples.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/tune.spls-examples.R b/examples/tune.spls-examples.R index ddee2700..e38b5968 100644 --- a/examples/tune.spls-examples.R +++ b/examples/tune.spls-examples.R @@ -1,3 +1,5 @@ +## sPLS2 model (i.e. more than one Y outcome variable) + # set up data data(liver.toxicity) X <- liver.toxicity$gene @@ -15,4 +17,15 @@ tune.res <- tune.spls( X, Y, ncomp = 3, test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8), measure = "cor", folds = 5, nrepeat = 3, progressBar = TRUE) -plot(tune.res) # plot outputs \ No newline at end of file +plot(tune.res) # plot outputs + +## sPLS1 model (i.e. only one Y outcome variable) + +# set up data +Y1 <- liver.toxicity$clinic[,1] + +# tune spls model for components only +plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = NULL, test.keepY = NULL)) + +# tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set +plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8), measure = "MSE")) \ No newline at end of file From c28de609a25bb14347548b2cab648f05befd8e78 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:33:47 +1100 Subject: [PATCH 12/75] update tune.spls docs --- R/tune.spls.R | 2 +- man/tune.spls.Rd | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R/tune.spls.R b/R/tune.spls.R index 033a6a68..440569b8 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -56,7 +56,7 @@ #' @template arg/validation #' @template arg/folds #' @template arg/nrepeat -#' @param measure The tuning measure to use. See details. +#' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. #' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} #' @template arg/mode #' @param arg/scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index 73bd4439..e0f52969 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -52,7 +52,7 @@ details.} \item{folds}{Positive Integer, The folds in the Mfold cross-validation.} -\item{measure}{The tuning measure to use. See details.} +\item{measure}{The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details.} \item{mode}{Character string indicating the type of PLS algorithm to use. One of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} @@ -199,6 +199,8 @@ See also \code{?perf} for more details. } \examples{ +## sPLS2 model (i.e. more than one Y outcome variable) + # set up data data(liver.toxicity) X <- liver.toxicity$gene @@ -217,6 +219,17 @@ tune.res <- tune.spls( X, Y, ncomp = 3, test.keepY = c(3, 6, 8), measure = "cor", folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) # plot outputs + +## sPLS1 model (i.e. only one Y outcome variable) + +# set up data +Y1 <- liver.toxicity$clinic[,1] + +# tune spls model for components only +plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = NULL, test.keepY = NULL)) + +# tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set +plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8), measure = "MSE")) } \references{ mixOmics article: From fb582674fe6249defb044c9800bef9443396932f Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:01:48 +1100 Subject: [PATCH 13/75] init tune.pls --- NAMESPACE | 1 + R/tune.pls.R | 196 ++++++++++++++++++++++++++++++++++++++++++ man/tune.pls.Rd | 220 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 417 insertions(+) create mode 100644 R/tune.pls.R create mode 100644 man/tune.pls.Rd diff --git a/NAMESPACE b/NAMESPACE index ae073e9f..bec3f7ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -172,6 +172,7 @@ export(tune) export(tune.block.splsda) export(tune.mint.splsda) export(tune.pca) +export(tune.pls) export(tune.plsda) export(tune.rcc) export(tune.spca) diff --git a/R/tune.pls.R b/R/tune.pls.R new file mode 100644 index 00000000..8b21c7d8 --- /dev/null +++ b/R/tune.pls.R @@ -0,0 +1,196 @@ +############################################################################################################# +# Authors: +# Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Benoit Gautier, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Francois Bartolo, Institut National des Sciences Appliquees et Institut de Mathematiques, Universite de Toulouse et CNRS (UMR 5219), France +# +# created: 2013 +# last modified: 05-10-2017 +# +# Copyright (C) 2013 +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +############################################################################################################# + + +# ======================================================================================================== +# tune.pls: Tuning hyperparameters on a pls method +# ======================================================================================================== +#' +#' Tuning functions for PLS method +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +#' grid to determine optimal values for the parameters in \code{spls}. +#' +#' +#' This tuning function should be used to tune the number of components to select for spls models. +#' +#' +#' @template section/folds +#' @template section/nrepeat +#' @template section/measure-pls +#' @template section/t-test-process +#' +#' @section more: +#' See also \code{?perf} for more details. +#' +#' @param X numeric matrix of predictors with the rows as individual observations. +#' @param Y numeric matrix of response(s) with the rows as individual observations matching \code{X}. +#' @template arg/ncomp +#' @template arg/validation +#' @template arg/folds +#' @template arg/nrepeat +#' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. +#' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} +#' @template arg/mode +#' @param arg/scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE +#' @param arg/tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. +#' @param arg/max.iter Integer, the maximum number of iterations. Default to 100. +#' @param arg/near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param arg/logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. +#' @param arg/multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. +#' @template arg/progressBar +#' @template arg/BPPARAM +#' @param seed set a number here if you want the function to give reproducible outputs. +#' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. +#' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}. +#' @param ... Optional parameters passed to \code{\link{spls}} +#' @return +#' Returns a list with the following components for every repeat: +#' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +#' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +#' available when in regression (s)PLS.} +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +#' inherited from \code{"pls"}, and \code{"spls"}. Only available when in +#' regression (s)PLS.} +#' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +#' else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +#' Note that in the specific case of an sPLS model, it is better to have a look +#' at the Q2.total criterion, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from +#' \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +#' \item{RSS}{Residual Sum of Squares across all selected features and the +#' components.} +#' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected +#' features and the components.} +#' \item{features}{a list of features selected across the +#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} +#' if using standard (non-sparse) PLS.} +#' \item{cor.tpred, cor.upred}{Correlation between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +#' predicted and actual components for X (t) and Y (u)} +#' \item{error.rate} +#' +#' +#' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, +#' Florian Rohart, +#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and +#' http://www.mixOmics.org for more details. +#' @references mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' +#' PLS and PLS citeria for PLS regression: Tenenhaus, M. (1998). La regression +#' PLS: theorie et pratique. Paris: Editions Technic. +#' +#' Chavent, Marie and Patouille, Brigitte (2003). Calcul des coefficients de +#' regression et du PRESS en regression PLS1. Modulad n, 30 1-11. (this is the +#' formula we use to calculate the Q2 in perf.pls and perf.spls) +#' +#' Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction +#' (MSEP) Estimates for Principal Component Regression (PCR) and Partial Least +#' Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429. +#' +#' sparse PLS regression mode: +#' +#' Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse +#' PLS for variable selection when integrating Omics data. Statistical +#' Applications in Genetics and Molecular Biology 7, article 35. +#' +#' One-sided t-tests (suppl material): +#' +#' Rohart F, Mason EA, Matigian N, Mosbergen R, Korn O, Chen T, Butcher S, +#' Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A&, Wells CA& +#' (2016). A Molecular Classification of Human Mesenchymal Stromal Cells. PeerJ +#' 4:e1845. +#' +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.pls-examples.R +#' +tune.pls <- + function(X, + Y, + ncomp, + # params related to CV + validation = c('Mfold', 'loo'), + nrepeat = 1, + folds, + measure = NULL, + # params related to spls model building + mode = c('regression', 'canonical', 'classic'), + scale = TRUE, + logratio = "none", + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, + # params related to running tune + BPPARAM = SerialParam(), + seed = NULL, + progressBar = FALSE, + limQ2 = 0.0975, + ... + ) { + + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + out = list() + mode <- match.arg(mode) + + BPPARAM$RNGseed <- seed + + X <- .check_numeric_matrix(X, block_name = 'X') + Y <- .check_numeric_matrix(Y, block_name = 'Y') + + check_cv <- .check_cv_args(validation = validation, + nrepeat = nrepeat, folds = folds, + N = nrow(X)) + validation <- check_cv$validation + nrepeat <- check_cv$nrepeat + folds <- check_cv$folds + + #-- build model and run perf() --------------------------------------# + #---------------------------------------------------------------------------# + pls_res <- pls(X, Y, ncomp, + mode = mode, scale = scale, logratio = logratio, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, multilevel = multilevel) + perf_res <- perf(pls_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } \ No newline at end of file diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd new file mode 100644 index 00000000..31c6a804 --- /dev/null +++ b/man/tune.pls.Rd @@ -0,0 +1,220 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.pls.R +\name{tune.pls} +\alias{tune.pls} +\title{Tuning functions for PLS method} +\usage{ +tune.pls( + X, + Y, + ncomp, + validation = c("Mfold", "loo"), + nrepeat = 1, + folds, + measure = NULL, + mode = c("regression", "canonical", "classic"), + scale = TRUE, + logratio = "none", + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + multilevel = NULL, + BPPARAM = SerialParam(), + seed = NULL, + progressBar = FALSE, + limQ2 = 0.0975, + ... +) +} +\arguments{ +\item{X}{numeric matrix of predictors with the rows as individual observations.} + +\item{Y}{numeric matrix of response(s) with the rows as individual observations matching \code{X}.} + +\item{ncomp}{Positive Integer. The number of components to include in the +model. Default to 2.} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (Leave-One-out). Default is +\code{"Mfold"}.} + +\item{nrepeat}{Positive integer. Number of times the Cross-Validation process +should be repeated. \code{nrepeat > 2} is required for robust tuning. See +details.} + +\item{folds}{Positive Integer, The folds in the Mfold cross-validation.} + +\item{measure}{The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details.} + +\item{mode}{Character string indicating the type of PLS algorithm to use. One +of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} + +\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type +of parallelisation. See examples in \code{?tune.spca}.} + +\item{seed}{set a number here if you want the function to give reproducible outputs. +Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} + +\item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the +computation completes. Default to \code{FALSE}.} + +\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} + +\item{...}{Optional parameters passed to \code{\link{spls}}} + +\item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} + +\item{arg/tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} + +\item{arg/max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{arg/near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} + +\item{arg/logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} + +\item{arg/multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} +} +\value{ +Returns a list with the following components for every repeat: +\item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only +applies to object inherited from \code{"pls"}, and \code{"spls"}. Only +available when in regression (s)PLS.} +\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object +inherited from \code{"pls"}, and \code{"spls"}. Only available when in +regression (s)PLS.} +\item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values +else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. +Note that in the specific case of an sPLS model, it is better to have a look +at the Q2.total criterion, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} +\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +\ldots ,}\code{ncomp} components, only applies to object inherited from +\code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} +\item{RSS}{Residual Sum of Squares across all selected features and the +components.} +\item{PRESS}{Predicted Residual Error Sum of Squares across all selected +features and the components.} +\item{features}{a list of features selected across the +folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and +\code{keepY} parameters from the input object. Note, this will be \code{NULL} +if using standard (non-sparse) PLS.} +\item{cor.tpred, cor.upred}{Correlation between the +predicted and actual components for X (t) and Y (u)} +\item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the +predicted and actual components for X (t) and Y (u)} +\item{error.rate} +} +\description{ +Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input +grid to determine optimal values for the parameters in \code{spls}. +} +\details{ +This tuning function should be used to tune the number of components to select for spls models. +} +\section{folds}{ + +During a cross-validation (CV), data are randomly split into \code{M} +subgroups (folds). \code{M-1} subgroups are then used to train submodels +which would be used to predict prediction accuracy statistics for the +held-out (test) data. All subgroups are used as the test data exactly once. +If \code{validation = "loo"}, leave-one-out CV is used where each group +consists of exactly one sample and hence \code{M == N} where N is the number +of samples. +} + +\section{nrepeat}{ + +The cross-validation process is repeated \code{nrepeat} times and the +accuracy measures are averaged across repeats. If \code{validation = "loo"}, +the process does not need to be repeated as there is only one way to split N +samples into N groups and hence nrepeat is forced to be 1. +} + +\section{measure}{ + +\itemize{ +\item \bold{For PLS2} Two measures of accuracy are available: Correlation +(\code{cor}, used as default), as well as the Residual Sum of Squares +(\code{RSS}). For \code{cor}, the parameters which would maximise the +correlation between the predicted and the actual components are chosen. The +\code{RSS} measure tries to predict the held-out data by matrix +reconstruction and seeks to minimise the error between actual and predicted +values. For \code{mode='canonical'}, The X matrix is used to calculate the +\code{RSS}, while for others modes the \code{Y} matrix is used. This measure +gives more weight to any large errors and is thus sensitive to outliers. It +also intrinsically selects less number of features on the \code{Y} block +compared to \code{measure='cor'}. +\item \bold{For PLS1} Four measures of accuracy are available: Mean Absolute +Error (\code{MAE}), Mean Square Error (\code{MSE}, used as default), +\code{Bias} and \code{R2}. Both MAE and MSE average the model prediction +error. MAE measures the average magnitude of the errors without considering +their direction. It is the average over the fold test samples of the absolute +differences between the Y predictions and the actual Y observations. The MSE +also measures the average magnitude of the error. Since the errors are +squared before they are averaged, the MSE tends to give a relatively high +weight to large errors. The Bias is the average of the differences between +the Y predictions and the actual Y observations and the R2 is the correlation +between the predictions and the observations. +} +} + +\section{Optimisation Process}{ + +The optimisation process is data-driven and similar to the process detailed +in (Rohart et al., 2016), where one-sided t-tests assess whether there is a +gain in performance when incrementing the number of features or components in +the model. However, it will assess all the provided grid through pair-wise +comparisons as the performance criteria do not always change linearly with +respect to the added number of features or components. +} + +\section{more}{ + +See also \code{?perf} for more details. +} + +\references{ +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 + +PLS and PLS citeria for PLS regression: Tenenhaus, M. (1998). La regression +PLS: theorie et pratique. Paris: Editions Technic. + +Chavent, Marie and Patouille, Brigitte (2003). Calcul des coefficients de +regression et du PRESS en regression PLS1. Modulad n, 30 1-11. (this is the +formula we use to calculate the Q2 in perf.pls and perf.spls) + +Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction +(MSEP) Estimates for Principal Component Regression (PCR) and Partial Least +Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429. + +sparse PLS regression mode: + +Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse +PLS for variable selection when integrating Omics data. Statistical +Applications in Genetics and Molecular Biology 7, article 35. + +One-sided t-tests (suppl material): + +Rohart F, Mason EA, Matigian N, Mosbergen R, Korn O, Chen T, Butcher S, +Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A&, Wells CA& +(2016). A Molecular Classification of Human Mesenchymal Stromal Cells. PeerJ +4:e1845. +} +\seealso{ +\code{\link{splsda}}, \code{\link{predict.splsda}} and +http://www.mixOmics.org for more details. +} +\author{ +Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, +Florian Rohart, +} +\keyword{multivariate} +\keyword{regression} From da6ca9037f7c14ea5c4fd0c22ef20d3f9311ef06 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:02:38 +1100 Subject: [PATCH 14/75] add tune.pls examples --- examples/tune.pls-examples.R | 17 +++++++++++++++++ man/tune.pls.Rd | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 examples/tune.pls-examples.R diff --git a/examples/tune.pls-examples.R b/examples/tune.pls-examples.R new file mode 100644 index 00000000..3afb183d --- /dev/null +++ b/examples/tune.pls-examples.R @@ -0,0 +1,17 @@ +# set up data +data(liver.toxicity) +X <- liver.toxicity$gene +Y <- liver.toxicity$clinic + +# tune pls model to find optimal number of components +tune.res <- tune.pls( X, Y, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs + +# PLS1 model +Y1 <- liver.toxicity$clinic[,1] + +tune.res <- tune.pls( X, Y1, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) + +plot(tune.res) \ No newline at end of file diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index 31c6a804..91cdc27d 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -177,6 +177,25 @@ respect to the added number of features or components. See also \code{?perf} for more details. } +\examples{ +# set up data +data(liver.toxicity) +X <- liver.toxicity$gene +Y <- liver.toxicity$clinic + +# tune pls model to find optimal number of components +tune.res <- tune.pls( X, Y, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs + +# PLS1 model +Y1 <- liver.toxicity$clinic[,1] + +tune.res <- tune.pls( X, Y1, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) + +plot(tune.res) +} \references{ mixOmics article: From dceeecdf68c286df4800c507ba4396747e5c9140 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:30:31 +1100 Subject: [PATCH 15/75] update tune.block.splsda docs and exmples --- R/tune.block.splsda.R | 543 ++++++++++++++------------ examples/tune.block.splsda-examples.R | 92 +++-- man/tune.block.splsda.Rd | 208 ++++++---- 3 files changed, 475 insertions(+), 368 deletions(-) diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index 41661650..bbabae2c 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -1,15 +1,15 @@ # ======================================================================================================== -# tune.splsda: chose the optimal number of parameters per component on a splsda method +# tune.block.splsda: Tuning hyperparameters on a block splsda method # ======================================================================================================== #' Tuning function for block.splsda method (N-integration with sparse #' Discriminant Analysis) #' #' Computes M-fold or Leave-One-Out Cross-Validation scores based on a -#' user-input grid to determine the optimal parsity parameters values for +#' user-input grid to determine the optimal parameters for #' method \code{block.splsda}. #' -#' This tuning function should be used to tune the keepX parameters in the -#' \code{block.splsda} function (N-integration with sparse Discriminant +#' This tuning function should be used to tune the number of components and the +#' keepX parameters in the \code{block.splsda} function (N-integration with sparse Discriminant #' Analysis). #' #' M-fold or LOO cross-validation is performed with stratified subsampling @@ -39,7 +39,7 @@ #' @param test.keepX A named list with the same length and names as X #' (without the outcome Y, if it is provided in X and designated using #' \code{indY}). Each entry of this list is a numeric vector for the different -#' keepX values to test for that specific block. +#' keepX values to test for that specific block. If set to NULL, ncomp is tuned. #' @param already.tested.X Optional, if \code{ncomp > 1} A named list of #' numeric vectors each of length \code{n_tested} indicating the number of #' variables to select from the \eqn{X} data set on the first \code{n_tested} @@ -76,6 +76,45 @@ #' #' \item{cor.value}{compute the correlation between latent variables for #' two-factor sPLS-DA analysis.} +#' +#' If \code{test.keepX = NULL}, returns: +#' \item{error.rate}{Prediction error rate for each block of \code{object$X} +#' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for +#' each block of \code{object$X}, each \code{dist} and each class} +#' \item{predict}{Predicted values of each sample for each class, each block +#' and each component} \item{class}{Predicted class of each sample for each +#' block, each \code{dist}, each component and each nrepeat} \item{features}{a +#' list of features selected across the folds (\code{$stable.X} and +#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the +#' input object.} \item{AveragedPredict.class}{if more than one block, returns +#' the average predicted class over the blocks (averaged of the \code{Predict} +#' output and prediction using the \code{max.dist} distance)} +#' \item{AveragedPredict.error.rate}{if more than one block, returns the +#' average predicted error rate over the blocks (using the +#' \code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more +#' than one block, returns the weighted predicted class over the blocks +#' (weighted average of the \code{Predict} output and prediction using the +#' \code{max.dist} distance). See details for more info on weights.} +#' \item{WeightedPredict.error.rate}{if more than one block, returns the +#' weighted average predicted error rate over the blocks (using the +#' \code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one +#' block, returns the majority class over the blocks. NA for a sample means that +#' there is no consensus on the predicted class for this particular sample over +#' the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +#' the error rate of the \code{MajorityVote} output} +#' \item{WeightedVote}{if more than one block, returns the weighted majority +#' class over the blocks. NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{WeightedVote.error.rate}{if more than one block, returns the error +#' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +#' of each block used for the weighted predictions, for each nrepeat and each +#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number +#' of components for the model for each prediction distance using one-sided +#' t-tests that test for a significant difference in the mean error rate (gain +#' in prediction) when components are added to the model. See more details in +#' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +#' returned for each prediction framework.} +#' #' @author Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi #' @seealso \code{\link{block.splsda}} and http://www.mixOmics.org for more #' details. @@ -93,29 +132,33 @@ #' @export #' @example ./examples/tune.block.splsda-examples.R tune.block.splsda <- - function (X, + function ( + # basic params + X, Y, indY, ncomp = 2, - test.keepX, - already.tested.X, - validation = "Mfold", - folds = 10, - dist = "max.dist", - measure = "BER", - # one of c("overall","BER") - weighted = TRUE, - # optimise the weighted or not-weighted prediction - progressBar = FALSE, + # model building params tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - nrepeat = 1, design, scale = TRUE, - light.output = TRUE, - # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + # sparse method params + test.keepX, + already.tested.X, + # cross validation params + validation = "Mfold", + folds = 10, + nrepeat = 1, signif.threshold=0.01, + # measure of performance params + dist = "max.dist", + measure = "BER", # one of c("overall","BER") + weighted = TRUE, # optimise the weighted or not-weighted prediction + # processing params + progressBar = FALSE, + light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat BPPARAM = SerialParam(), seed = NULL, ...) @@ -248,262 +291,276 @@ tune.block.splsda <- ) } + #-- validation if (any(is.na(validation)) || length(validation) > 1) stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) #-- test.keepX - if (missing(test.keepX)) - { - test.keepX = lapply(X, function(x) { - max.test.keepX <- min(30, ncol(x)) - if (max.test.keepX > 15) - return(seq(5, max.test.keepX, 5)) - else - return(seq(1, max.test.keepX, 2)) - }) - + if (is.null(test.keepX)){ + print("test.keepX is set to NULL, tuning only for number of components...") + block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, ncomp, keepX = ncol(X), + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) + perf_res <- perf(block.splsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + } else { - if (length(test.keepX) != length(X)) - stop( - paste( - "test.keepX should be a list of length ", - length(X), - ", corresponding to the blocks: ", - paste(names(X), collapse = ", "), - sep = "" + + if (missing(test.keepX)) + { + test.keepX = lapply(X, function(x) { + max.test.keepX <- min(30, ncol(x)) + if (max.test.keepX > 15) + return(seq(5, max.test.keepX, 5)) + else + return(seq(1, max.test.keepX, 2)) + }) + + } else { + if (length(test.keepX) != length(X)) + stop( + paste( + "test.keepX should be a list of length ", + length(X), + ", corresponding to the blocks: ", + paste(names(X), collapse = ", "), + sep = "" + ) ) - ) + + #aa = sapply(test.keepX, length) + #if (any(is.null(aa) | aa == 1 | !is.numeric(aa))) + #stop("Each entry of 'test.keepX' must be a numeric vector with more than two values", call. = FALSE) + + } - #aa = sapply(test.keepX, length) - #if (any(is.null(aa) | aa == 1 | !is.numeric(aa))) - #stop("Each entry of 'test.keepX' must be a numeric vector with more than two values", call. = FALSE) + l = sapply(test.keepX, length) + n = names(test.keepX) + temp = data.frame(l, n) - } - - l = sapply(test.keepX, length) - n = names(test.keepX) - temp = data.frame(l, n) - - - message( - paste( - "\nYou have provided a sequence of keepX of length: ", - paste(apply(temp, 1, function(x) - paste(x, collapse = " for block ")), collapse = " and "), - ".\nThis results in ", - prod(sapply(test.keepX, length)), - " models being fitted for each component and each nrepeat, this may take some time to run, be patient!", - sep = "" + + message( + paste( + "\nYou have provided a sequence of keepX of length: ", + paste(apply(temp, 1, function(x) + paste(x, collapse = " for block ")), collapse = " and "), + ".\nThis results in ", + prod(sapply(test.keepX, length)), + " models being fitted for each component and each nrepeat, this may take some time to run, be patient!", + sep = "" + ) ) - ) - - if (is (BPPARAM, 'SerialParam')) - { - message(paste0( - "\nYou can look into the 'BPPARAM' argument to speed up computation time." - )) - } else { - if (progressBar == TRUE) + if (is (BPPARAM, 'SerialParam')) + { message(paste0( - "\nAs code is running in parallel, the progressBar is not available." + "\nYou can look into the 'BPPARAM' argument to speed up computation time." )) - } - - ## ----------- END checks -----------# - - ## ----------- NA calculation ----------- - - misdata = c(sapply(X, anyNA), Y = FALSE) # Detection of missing data. we assume no missing values in the factor Y - - is.na.A = vector("list", length = length(X)) - for (q in seq_along(X)) - { - if (misdata[q]) + + } else { + if (progressBar == TRUE) + message(paste0( + "\nAs code is running in parallel, the progressBar is not available." + )) + } + + ## ----------- END checks -----------# + + ## ----------- NA calculation ----------- + + misdata = c(sapply(X, anyNA), Y = FALSE) # Detection of missing data. we assume no missing values in the factor Y + + is.na.A = vector("list", length = length(X)) + for (q in seq_along(X)) { - is.na.A[[q]] = is.na(X[[q]]) - #ind.NA[[q]] = which(apply(is.na.A[[q]], 1, sum) > 0) # calculated only once - #ind.NA.col[[q]] = which(apply(is.na.A[[q]], 2, sum) >0) # indice of the col that have missing values. used in the deflation + if (misdata[q]) + { + is.na.A[[q]] = is.na(X[[q]]) + #ind.NA[[q]] = which(apply(is.na.A[[q]], 1, sum) > 0) # calculated only once + #ind.NA.col[[q]] = which(apply(is.na.A[[q]], 2, sum) >0) # indice of the col that have missing values. used in the deflation + } } - } - - ## ----------- END NA calculation ----------- # - - - # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) - if ((!is.null(already.tested.X)) & length(already.tested.X) > 0) - { - comp.real = (length(already.tested.X[[1]]) + 1):ncomp - #check and match already.tested.X to X - if (length(already.tested.X[[1]]) > 0) + + ## ----------- END NA calculation ----------- # + + + # if some components have already been tuned (eg comp1 and comp2), we're only tuning the following ones (comp3 comp4 .. ncomp) + if ((!is.null(already.tested.X)) & length(already.tested.X) > 0) { - if (length(unique(names(already.tested.X))) != length(already.tested.X) | - sum(is.na(match(names( - already.tested.X - ), names(X)))) > 0) - stop( - "Each entry of 'already.tested.X' must have a unique name corresponding to a block of 'X'" - ) + comp.real = (length(already.tested.X[[1]]) + 1):ncomp + #check and match already.tested.X to X + if (length(already.tested.X[[1]]) > 0) + { + if (length(unique(names(already.tested.X))) != length(already.tested.X) | + sum(is.na(match(names( + already.tested.X + ), names(X)))) > 0) + stop( + "Each entry of 'already.tested.X' must have a unique name corresponding to a block of 'X'" + ) + + } + } else { + comp.real = seq_len(ncomp) } - } else { - comp.real = seq_len(ncomp) - } - - # near zero var on the whole data sets. It will be performed inside each fold as well - if (near.zero.var == TRUE) - { - nzv.A = lapply(X, nearZeroVar) - for (q in seq_along(X)) + # near zero var on the whole data sets. It will be performed inside each fold as well + if (near.zero.var == TRUE) { - if (length(nzv.A[[q]]$Position) > 0) + nzv.A = lapply(X, nearZeroVar) + for (q in seq_along(X)) { - names.remove.X = colnames(X[[q]])[nzv.A[[q]]$Position] - X[[q]] = X[[q]][, -nzv.A[[q]]$Position, drop = FALSE] - warning( - "Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors." - ) - if (ncol(X[[q]]) == 0) - stop(paste0("No more variables in", X[[q]])) + if (length(nzv.A[[q]]$Position) > 0) + { + names.remove.X = colnames(X[[q]])[nzv.A[[q]]$Position] + X[[q]] = X[[q]][, -nzv.A[[q]]$Position, drop = FALSE] + warning( + "Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors." + ) + if (ncol(X[[q]]) == 0) + stop(paste0("No more variables in", X[[q]])) + + #need to check that the keepA[[q]] is now not higher than ncol(A[[q]]) + if (any(test.keepX[[q]] > ncol(X[[q]]))) + test.keepX[[q]][which(test.keepX[[q]] > ncol(X[[q]]))] = ncol(X[[q]]) + } - #need to check that the keepA[[q]] is now not higher than ncol(A[[q]]) - if (any(test.keepX[[q]] > ncol(X[[q]]))) - test.keepX[[q]][which(test.keepX[[q]] > ncol(X[[q]]))] = ncol(X[[q]]) } - } - } - N.test.keepX = nrow(expand.grid(test.keepX)) - - mat.error.rate = list() - - mat.sd.error = matrix(0, - nrow = N.test.keepX, - ncol = ncomp - length(already.tested.X[[1]])) - - mat.mean.error = matrix(nrow = N.test.keepX, + N.test.keepX = nrow(expand.grid(test.keepX)) + + mat.error.rate = list() + + mat.sd.error = matrix(0, + nrow = N.test.keepX, ncol = ncomp - length(already.tested.X[[1]])) - - - mat.error.rate = list() - error.per.class.keepX.opt = list() - error.per.class.keepX.opt.mean = matrix( - 0, - nrow = nlevels(Y), - ncol = length(comp.real), - dimnames = list(c(levels(Y)), c(paste0('comp', comp.real))) - ) - - error.opt.per.comp = matrix( - nrow = nrepeat, - ncol = length(comp.real), - dimnames = list(paste("nrep", seq_len(nrepeat), sep = "."), paste0("comp", comp.real)) - ) - - if (light.output == FALSE) - class.all = list() - - ## ----------- tune components ----------- - - # successively tune the components until ncomp: comp1, then comp2, ... - for (comp in seq_along(comp.real)) - { - tune_comp <- comp.real[comp] - if (progressBar == TRUE) - cat(sprintf("\ntuning component %s\n", tune_comp)) - - result = MCVfold.block.splsda( - X, - Y, - validation = validation, - folds = folds, - nrepeat = nrepeat, - ncomp = tune_comp, - choice.keepX = already.tested.X, - scheme = scheme, - design = design, - init = init, - tol = tol, - test.keepX = test.keepX, - measure = measure, - dist = dist, - scale = scale, - weighted = weighted, - near.zero.var = near.zero.var, - progressBar = progressBar, - max.iter = max.iter, - misdata = misdata, - is.na.A = is.na.A, - BPPARAM = BPPARAM + + mat.mean.error = matrix(nrow = N.test.keepX, + ncol = ncomp - length(already.tested.X[[1]])) + + + mat.error.rate = list() + error.per.class.keepX.opt = list() + error.per.class.keepX.opt.mean = matrix( + 0, + nrow = nlevels(Y), + ncol = length(comp.real), + dimnames = list(c(levels(Y)), c(paste0('comp', comp.real))) ) + error.opt.per.comp = matrix( + nrow = nrepeat, + ncol = length(comp.real), + dimnames = list(paste("nrep", seq_len(nrepeat), sep = "."), paste0("comp", comp.real)) + ) - ## returns error.rate for all test.keepX + if (light.output == FALSE) + class.all = list() - # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.block.splsda' can work with multiple distances - mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] - mat.mean.error[, comp] = result[[measure]]$error.rate.mean[[1]] - if (!is.null(result[[measure]]$error.rate.sd[[1]])) - mat.sd.error[, comp] = result[[measure]]$error.rate.sd[[1]] + ## ----------- tune components ----------- - # confusion matrix for keepX.opt - error.per.class.keepX.opt[[comp]] = result[[measure]]$confusion[[1]] - error.per.class.keepX.opt.mean[, comp] = apply(result[[measure]]$confusion[[1]], 1, mean) + # successively tune the components until ncomp: comp1, then comp2, ... + for (comp in seq_along(comp.real)) + { + tune_comp <- comp.real[comp] + if (progressBar == TRUE) + cat(sprintf("\ntuning component %s\n", tune_comp)) + + result = MCVfold.block.splsda( + X, + Y, + validation = validation, + folds = folds, + nrepeat = nrepeat, + ncomp = tune_comp, + choice.keepX = already.tested.X, + scheme = scheme, + design = design, + init = init, + tol = tol, + test.keepX = test.keepX, + measure = measure, + dist = dist, + scale = scale, + weighted = weighted, + near.zero.var = near.zero.var, + progressBar = progressBar, + max.iter = max.iter, + misdata = misdata, + is.na.A = is.na.A, + BPPARAM = BPPARAM + ) + + + ## returns error.rate for all test.keepX + + # in the following, there is [[1]] because 'tune' is working with only 1 distance and 'MCVfold.block.splsda' can work with multiple distances + mat.error.rate[[comp]] = result[[measure]]$mat.error.rate[[1]] + mat.mean.error[, comp] = result[[measure]]$error.rate.mean[[1]] + if (!is.null(result[[measure]]$error.rate.sd[[1]])) + mat.sd.error[, comp] = result[[measure]]$error.rate.sd[[1]] + + # confusion matrix for keepX.opt + error.per.class.keepX.opt[[comp]] = result[[measure]]$confusion[[1]] + error.per.class.keepX.opt.mean[, comp] = apply(result[[measure]]$confusion[[1]], 1, mean) + + # error rate for best keepX + error.opt.per.comp[, comp] = mat.error.rate[[comp]][result[[measure]]$ind.keepX.opt[[1]], ] + + # best keepX + already.tested.X = result[[measure]]$choice.keepX + + if (light.output == FALSE) + { + #prediction of each samples for each fold and each repeat, on each comp + class.all[[comp]] = result$class.comp[[1]] + } + } - # error rate for best keepX - error.opt.per.comp[, comp] = mat.error.rate[[comp]][result[[measure]]$ind.keepX.opt[[1]], ] + ## ----------- END tune components ----------- # - # best keepX - already.tested.X = result[[measure]]$choice.keepX + ## ----------- output ----------- - if (light.output == FALSE) + rownames(mat.mean.error) = rownames(result[[measure]]$mat.error.rate[[1]]) + colnames(mat.mean.error) = paste0("comp", comp.real) + names(mat.error.rate) = c(paste0("comp", comp.real)) + names(error.per.class.keepX.opt) = c(paste0("comp", comp.real)) + if (nrepeat > 1) { - #prediction of each samples for each fold and each repeat, on each comp - class.all[[comp]] = result$class.comp[[1]] + rownames(mat.sd.error) = rownames(result[[measure]]$mat.error.rate[[1]]) + colnames(mat.sd.error) = paste0("comp", comp.real) } + + + # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) + if (nrepeat > 2 & length(comp.real) > 1) + { + error.keepX = error.opt.per.comp + opt = t.test.process(error.opt.per.comp, alpha = signif.threshold) + ncomp_opt = comp.real[opt] + } else { + ncomp_opt = error.keepX = NULL + } + + + result = list( + error.rate = mat.mean.error, + error.rate.sd = mat.sd.error, + error.rate.all = mat.error.rate, + choice.keepX = already.tested.X, + choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), + error.rate.class = error.per.class.keepX.opt + ) + + result$measure = measure.input + result$call = match.call() + + class(result) = "tune.block.splsda" + + return(result) + } - - ## ----------- END tune components ----------- # - - ## ----------- output ----------- - - rownames(mat.mean.error) = rownames(result[[measure]]$mat.error.rate[[1]]) - colnames(mat.mean.error) = paste0("comp", comp.real) - names(mat.error.rate) = c(paste0("comp", comp.real)) - names(error.per.class.keepX.opt) = c(paste0("comp", comp.real)) - if (nrepeat > 1) - { - rownames(mat.sd.error) = rownames(result[[measure]]$mat.error.rate[[1]]) - colnames(mat.sd.error) = paste0("comp", comp.real) - } - - - # calculating the number of optimal component based on t.tests and the error.rate.all, if more than 3 error.rates(repeat>3) - if (nrepeat > 2 & length(comp.real) > 1) - { - error.keepX = error.opt.per.comp - opt = t.test.process(error.opt.per.comp, alpha = signif.threshold) - ncomp_opt = comp.real[opt] - } else { - ncomp_opt = error.keepX = NULL - } - - - result = list( - error.rate = mat.mean.error, - error.rate.sd = mat.sd.error, - error.rate.all = mat.error.rate, - choice.keepX = already.tested.X, - choice.ncomp = list(ncomp = ncomp_opt, values = error.keepX), - error.rate.class = error.per.class.keepX.opt - ) - - result$measure = measure.input - result$call = match.call() - - class(result) = "tune.block.splsda" - - return(result) - - } + } \ No newline at end of file diff --git a/examples/tune.block.splsda-examples.R b/examples/tune.block.splsda-examples.R index 67ce833a..251b67e2 100644 --- a/examples/tune.block.splsda-examples.R +++ b/examples/tune.block.splsda-examples.R @@ -1,58 +1,64 @@ -\dontrun{ +## Set up data + +# load data data("breast.TCGA") -# this is the X data as a list of mRNA and miRNA; the Y data set is a single data set of proteins -data = list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, - protein = breast.TCGA$data.train$protein) + +# X data - list of mRNA and miRNA +X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein) + +# Y data - single data set of proteins +Y <- breast.TCGA$data.train$subtype + +# subset the X and Y data to speed up computation in this example +set.seed(100) +subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] +X <- lapply(X, function(omic) omic[subset,]) +Y <- Y[subset] + # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript design = matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data))) diag(design) = 0 design -# set number of component per data set -ncomp = 3 -# Tuning the first two components -# ------------- -## Not run: +## Tune number of components to keep +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 5, + test.keepX = NULL, + nrepeat = 3, + seed = 13) + +plot(tune_res) + +tune_res$choice.ncomp # 3 components best + +## Tune number of variables to keep + # definition of the keepX value to be tested for each block mRNA miRNA and protein # names of test.keepX must match the names of 'data' test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8)) -# the following may take some time to run, so we subset the data first. -# Note that for thorough tuning, nrepeat should be >= 3 so that significance of -# the model improvement can be measured -## ---- subset by 3rd of samples -set.seed(100) -subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] -data <- lapply(data, function(omic) omic[subset,]) -Y <- breast.TCGA$data.train$subtype[subset] -## ---- run -# Check if the environment variable exists (during R CMD check) and limit cores accordingly -max_cores <- if (Sys.getenv("_R_CHECK_LIMIT_CORES_") != "") 2 else parallel::detectCores() - 1 -# Setup the parallel backend with the appropriate number of workers -BPPARAM <- BiocParallel::MulticoreParam(workers = max_cores) - -tune <- tune.block.splsda( - X = data, - Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - design = design, - nrepeat = 2, - BPPARAM = BPPARAM -) - -plot(tune) -tune$choice.ncomp -tune$choice.keepX +# load parallel package +library(BiocParallel) + +# run tuning in parallel on 2 cores +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 2, + test.keepX = test.keepX, + nrepeat = 3, + seed = 13, + BPPARAM = SnowParam(workers = 2)) + +plot(tune_res) +tune_res$choice.keepX # Now tuning a new component given previous tuned keepX -already.tested.X = tune$choice.keepX -tune = tune.block.splsda(X = data, Y = Y, - ncomp = 4, test.keepX = test.keepX, design = design, +already.tested.X <- tune_res$choice.keepX +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 3, + test.keepX = test.keepX, already.tested.X = already.tested.X, - BPPARAM = BPPARAM - ) -tune$choice.keepX -} \ No newline at end of file + BPPARAM = SnowParam(workers = 2)) +tune_res$choice.keepX \ No newline at end of file diff --git a/man/tune.block.splsda.Rd b/man/tune.block.splsda.Rd index f36a7cbb..a43e0489 100644 --- a/man/tune.block.splsda.Rd +++ b/man/tune.block.splsda.Rd @@ -10,22 +10,22 @@ tune.block.splsda( Y, indY, ncomp = 2, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + design, + scale = TRUE, test.keepX, already.tested.X, validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, dist = "max.dist", measure = "BER", weighted = TRUE, progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - design, - scale = TRUE, light.output = TRUE, - signif.threshold = 0.01, BPPARAM = SerialParam(), seed = NULL, ... @@ -44,10 +44,33 @@ the matrix response in the list \code{X}.} \item{ncomp}{the number of components to include in the model. Default to 2. Applies to all blocks.} +\item{tol}{Positive numeric used as convergence criteria/tolerance during the +iterative process. Default to \code{1e-06}.} + +\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Setting this argument to FALSE (when appropriate) will speed up the +computations. Default value is FALSE.} + +\item{design}{numeric matrix of size (number of blocks in X) x (number of +blocks in X) with values between 0 and 1. Each value indicates the strenght +of the relationship to be modelled between two blocks; a value of 0 +indicates no relationship, 1 is the maximum value. Alternatively, one of +c('null', 'full') indicating a disconnected or fully connected design, +respecively, or a numeric between 0 and 1 which will designate all +off-diagonal elements of a fully connected design (see examples in +\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the +\code{design} matrix is changed to include relationships to \code{Y}.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + \item{test.keepX}{A named list with the same length and names as X (without the outcome Y, if it is provided in X and designated using \code{indY}). Each entry of this list is a numeric vector for the different -keepX values to test for that specific block.} +keepX values to test for that specific block. If set to NULL, ncomp is tuned.} \item{already.tested.X}{Optional, if \code{ncomp > 1} A named list of numeric vectors each of length \code{n_tested} indicating the number of @@ -60,6 +83,12 @@ matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is \item{folds}{the folds in the Mfold cross-validation. See Details.} +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{distance metric to estimate the classification error rate, should be a subset of \code{"centroids.dist"}, \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} @@ -72,38 +101,9 @@ the Weighted vote.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{tol}{Positive numeric used as convergence criteria/tolerance during the -iterative process. Default to \code{1e-06}.} - -\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} - -\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} -function (should be set to TRUE in particular for data with many zero -values). Setting this argument to FALSE (when appropriate) will speed up the -computations. Default value is FALSE.} - -\item{nrepeat}{Number of times the Cross-Validation process is repeated.} - -\item{design}{numeric matrix of size (number of blocks in X) x (number of -blocks in X) with values between 0 and 1. Each value indicates the strenght -of the relationship to be modelled between two blocks; a value of 0 -indicates no relationship, 1 is the maximum value. Alternatively, one of -c('null', 'full') indicating a disconnected or fully connected design, -respecively, or a numeric between 0 and 1 which will designate all -off-diagonal elements of a fully connected design (see examples in -\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the -\code{design} matrix is changed to include relationships to \code{Y}.} - -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} - \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples.} @@ -139,15 +139,53 @@ comp and each repeat. Only if light.output=FALSE} \item{cor.value}{compute the correlation between latent variables for two-factor sPLS-DA analysis.} + +If \code{test.keepX = NULL}, returns: +\item{error.rate}{Prediction error rate for each block of \code{object$X} +and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for +each block of \code{object$X}, each \code{dist} and each class} +\item{predict}{Predicted values of each sample for each class, each block +and each component} \item{class}{Predicted class of each sample for each +block, each \code{dist}, each component and each nrepeat} \item{features}{a +list of features selected across the folds (\code{$stable.X} and +\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the +input object.} \item{AveragedPredict.class}{if more than one block, returns +the average predicted class over the blocks (averaged of the \code{Predict} +output and prediction using the \code{max.dist} distance)} +\item{AveragedPredict.error.rate}{if more than one block, returns the +average predicted error rate over the blocks (using the +\code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more +than one block, returns the weighted predicted class over the blocks +(weighted average of the \code{Predict} output and prediction using the +\code{max.dist} distance). See details for more info on weights.} +\item{WeightedPredict.error.rate}{if more than one block, returns the +weighted average predicted error rate over the blocks (using the +\code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one +block, returns the majority class over the blocks. NA for a sample means that +there is no consensus on the predicted class for this particular sample over +the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +the error rate of the \code{MajorityVote} output} +\item{WeightedVote}{if more than one block, returns the weighted majority +class over the blocks. NA for a sample means that there is no consensus on +the predicted class for this particular sample over the blocks.} +\item{WeightedVote.error.rate}{if more than one block, returns the error +rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +of each block used for the weighted predictions, for each nrepeat and each +fold} \item{choice.ncomp}{For supervised models; returns the optimal number +of components for the model for each prediction distance using one-sided +t-tests that test for a significant difference in the mean error rate (gain +in prediction) when components are added to the model. See more details in +Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +returned for each prediction framework.} } \description{ Computes M-fold or Leave-One-Out Cross-Validation scores based on a -user-input grid to determine the optimal parsity parameters values for +user-input grid to determine the optimal parameters for method \code{block.splsda}. } \details{ -This tuning function should be used to tune the keepX parameters in the -\code{block.splsda} function (N-integration with sparse Discriminant +This tuning function should be used to tune the number of components and the +keepX parameters in the \code{block.splsda} function (N-integration with sparse Discriminant Analysis). M-fold or LOO cross-validation is performed with stratified subsampling @@ -172,64 +210,70 @@ class, weighted by the number of samples in each class. BER is less biased towards majority classes during the performance assessment. } \examples{ -\dontrun{ +## Set up data + +# load data data("breast.TCGA") -# this is the X data as a list of mRNA and miRNA; the Y data set is a single data set of proteins -data = list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, - protein = breast.TCGA$data.train$protein) + +# X data - list of mRNA and miRNA +X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein) + +# Y data - single data set of proteins +Y <- breast.TCGA$data.train$subtype + +# subset the X and Y data to speed up computation in this example +set.seed(100) +subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] +X <- lapply(X, function(omic) omic[subset,]) +Y <- Y[subset] + # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript design = matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data))) diag(design) = 0 design -# set number of component per data set -ncomp = 3 -# Tuning the first two components -# ------------- -## Not run: +## Tune number of components to keep +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 5, + test.keepX = NULL, + nrepeat = 3, + seed = 13) + +plot(tune_res) + +tune_res$choice.ncomp # 3 components best + +## Tune number of variables to keep + # definition of the keepX value to be tested for each block mRNA miRNA and protein # names of test.keepX must match the names of 'data' test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8)) -# the following may take some time to run, so we subset the data first. -# Note that for thorough tuning, nrepeat should be >= 3 so that significance of -# the model improvement can be measured -## ---- subset by 3rd of samples -set.seed(100) -subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] -data <- lapply(data, function(omic) omic[subset,]) -Y <- breast.TCGA$data.train$subtype[subset] -## ---- run -# Check if the environment variable exists (during R CMD check) and limit cores accordingly -max_cores <- if (Sys.getenv("_R_CHECK_LIMIT_CORES_") != "") 2 else parallel::detectCores() - 1 -# Setup the parallel backend with the appropriate number of workers -BPPARAM <- BiocParallel::MulticoreParam(workers = max_cores) - -tune <- tune.block.splsda( - X = data, - Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - design = design, - nrepeat = 2, - BPPARAM = BPPARAM -) +# load parallel package +library(BiocParallel) -plot(tune) -tune$choice.ncomp -tune$choice.keepX +# run tuning in parallel on 2 cores +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 2, + test.keepX = test.keepX, + nrepeat = 3, + seed = 13, + BPPARAM = SnowParam(workers = 2)) + +plot(tune_res) +tune_res$choice.keepX # Now tuning a new component given previous tuned keepX -already.tested.X = tune$choice.keepX -tune = tune.block.splsda(X = data, Y = Y, - ncomp = 4, test.keepX = test.keepX, design = design, +already.tested.X <- tune_res$choice.keepX +tune_res <- tune.block.splsda(X, Y, design = design, + ncomp = 3, + test.keepX = test.keepX, already.tested.X = already.tested.X, - BPPARAM = BPPARAM - ) -tune$choice.keepX -} + BPPARAM = SnowParam(workers = 2)) +tune_res$choice.keepX } \references{ Method: From 645d96227243b2bd142f4b08bd96b96e03d64e72 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:12:48 +1100 Subject: [PATCH 16/75] typos in tune.block.splsda --- R/tune.block.splsda.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index bbabae2c..978579ce 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -298,7 +298,7 @@ tune.block.splsda <- #-- test.keepX if (is.null(test.keepX)){ print("test.keepX is set to NULL, tuning only for number of components...") - block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, ncomp, keepX = ncol(X), + block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, keepX = setNames(as.list(rep(nrow(X[[1]]), length(names(X)))), names(X)), scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) perf_res <- perf(block.splsda_res, validation = validation, folds = folds, nrepeat = nrepeat, @@ -307,6 +307,7 @@ tune.block.splsda <- return(perf_res) } else { + if (missing(test.keepX)) { From cf968246d57328e46c45dc48f23c170172839cdc Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:12:59 +1100 Subject: [PATCH 17/75] unit testing for new tune.block.splsda functionality --- tests/testthat/test-tune.block.splsda.R | 43 ++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-tune.block.splsda.R b/tests/testthat/test-tune.block.splsda.R index 7e9b81bd..b1da06e2 100644 --- a/tests/testthat/test-tune.block.splsda.R +++ b/tests/testthat/test-tune.block.splsda.R @@ -157,4 +157,45 @@ test_that("tune.block.splsda works independently and in tune wrapper the same", BPPARAM = SerialParam(), seed = 42 ) expect_equal(tune11$choice.keepX, tune41$choice.keepX) -}) \ No newline at end of file +}) + +test_that("tune.block.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + data("breast.TCGA") + data = list( + mrna = breast.TCGA$data.train$mrna, + mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein + ) + + design = 'full' + ncomp <- 2 + folds <- 3 + nrep <- 3 + test.keepX = list( + mrna = c(10, 20), + mirna = c(20, 30), + protein = c(3, 6) + ) + + set.seed(100) + subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 4)[[1]][[1]] + X <- lapply(data, function(omic) omic[subset,]) + Y <- breast.TCGA$data.train$subtype[subset] + + # tune on components only + tune_res <- suppressWarnings( + tune.block.splsda(X, Y, ncomp = 3, design = design, + nrepeat = 1, folds = 3, + test.keepX = NULL, seed = 20) + ) + + # run perf + block.splsda_res <- block.splsda(X, Y, ncomp = 3, design = design) + perf_res <- suppressWarnings( + perf(block.splsda_res, ncomp = 3, nrepeat = 1, folds = 3, dist = "max.dist", seed = 20) + ) + + # check results match + expect_equal(tune_res$error.rate$mrna[1,1], perf_res$error.rate$mrna[1,1]) +}) From 6ad679ecd22bc1b5a176848d9d3789c17d86f558 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:24:28 +1100 Subject: [PATCH 18/75] init tune.block.plsda --- NAMESPACE | 1 + R/tune.block.plsda.R | 245 ++++++++++++++++++++++++++++++++++++++++ man/tune.block.plsda.Rd | 196 ++++++++++++++++++++++++++++++++ 3 files changed, 442 insertions(+) create mode 100644 R/tune.block.plsda.R create mode 100644 man/tune.block.plsda.Rd diff --git a/NAMESPACE b/NAMESPACE index bec3f7ec..8391cdc6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -169,6 +169,7 @@ export(spls) export(splsda) export(study_split) export(tune) +export(tune.block.plsda) export(tune.block.splsda) export(tune.mint.splsda) export(tune.pca) diff --git a/R/tune.block.plsda.R b/R/tune.block.plsda.R new file mode 100644 index 00000000..230dd18a --- /dev/null +++ b/R/tune.block.plsda.R @@ -0,0 +1,245 @@ +# ======================================================================================================== +# tune.block.plsda: Tuning hyperparameters on a block plsda method +# ======================================================================================================== +#' Tuning function for block.plsda method (N-integration with Discriminant Analysis) +#' +#' Computes M-fold or Leave-One-Out Cross-Validation scores based on a +#' user-input grid to determine the optimal parameters for +#' method \code{block.plsda}. +#' +#' This tuning function should be used to tune the number of components in the \code{block.plsda} function (N-integration with Discriminant Analysis). +#' +#' M-fold or LOO cross-validation is performed with stratified subsampling +#' where all classes are represented in each fold. +#' +#' If \code{validation = "Mfold"}, M-fold cross-validation is performed. The +#' number of folds to generate is to be specified in the argument \code{folds}. +#' +#' If \code{validation = "loo"}, leave-one-out cross-validation is performed. +#' By default \code{folds} is set to the number of unique individuals. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). Details +#' about the PLS modes are in \code{?pls}. +#' +#' BER is appropriate in case of an unbalanced number of samples per class as +#' it calculates the average proportion of wrongly classified samples in each +#' class, weighted by the number of samples in each class. BER is less biased +#' towards majority classes during the performance assessment. +#' +#' @inheritParams block.plsda +#' @inheritParams tune +#' @param weighted tune using either the performance of the Majority vote or +#' the Weighted vote. +#' @param signif.threshold numeric between 0 and 1 indicating the significance +#' threshold required for improvement in error rate of the components. Default +#' to 0.01. +#' @param ... Optional arguments: +#' \itemize{ +#' \item \bold{seed} Integer. Seed number for reproducible parallel code. +#' Default is \code{NULL}. +#' } +#' run in parallel when repeating the cross-validation, which is usually the +#' most computationally intensive process. If there is excess CPU, the +#' cross-vaidation is also parallelised on *nix-based OS which support +#' \code{mclapply}. +#' Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'. +#' @return returns: +#' \item{error.rate}{Prediction error rate for each block of \code{object$X} +#' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for +#' each block of \code{object$X}, each \code{dist} and each class} +#' \item{predict}{Predicted values of each sample for each class, each block +#' and each component} \item{class}{Predicted class of each sample for each +#' block, each \code{dist}, each component and each nrepeat} \item{features}{a +#' list of features selected across the folds (\code{$stable.X} and +#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the +#' input object.} \item{AveragedPredict.class}{if more than one block, returns +#' the average predicted class over the blocks (averaged of the \code{Predict} +#' output and prediction using the \code{max.dist} distance)} +#' \item{AveragedPredict.error.rate}{if more than one block, returns the +#' average predicted error rate over the blocks (using the +#' \code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more +#' than one block, returns the weighted predicted class over the blocks +#' (weighted average of the \code{Predict} output and prediction using the +#' \code{max.dist} distance). See details for more info on weights.} +#' \item{WeightedPredict.error.rate}{if more than one block, returns the +#' weighted average predicted error rate over the blocks (using the +#' \code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one +#' block, returns the majority class over the blocks. NA for a sample means that +#' there is no consensus on the predicted class for this particular sample over +#' the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +#' the error rate of the \code{MajorityVote} output} +#' \item{WeightedVote}{if more than one block, returns the weighted majority +#' class over the blocks. NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{WeightedVote.error.rate}{if more than one block, returns the error +#' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +#' of each block used for the weighted predictions, for each nrepeat and each +#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number +#' of components for the model for each prediction distance using one-sided +#' t-tests that test for a significant difference in the mean error rate (gain +#' in prediction) when components are added to the model. See more details in +#' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +#' returned for each prediction framework.} +#' +#' @author Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi +#' @seealso \code{\link{block.splsda}} and http://www.mixOmics.org for more +#' details. +#' @references Method: +#' +#' Singh A., Gautier B., Shannon C., Vacher M., Rohart F., Tebbutt S. and Lê +#' Cao K.A. (2016). DIABLO: multi omics integration for biomarker discovery. +#' +#' mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords regression multivariate +#' @export +#' @example ./examples/tune.block.plsda-examples.R +tune.block.plsda <- + function ( + # basic params + X, + Y, + indY, + ncomp = 2, + # model building params + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + design, + scale = TRUE, + # cross validation params + validation = "Mfold", + folds = 10, + nrepeat = 1, + signif.threshold=0.01, + # measure of performance params + dist = "max.dist", + measure = "BER", # one of c("overall","BER") + weighted = TRUE, # optimise the weighted or not-weighted prediction + # processing params + progressBar = FALSE, + light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + BPPARAM = SerialParam(), + seed = NULL, + ...) + { + if (hasArg('cpus')) #defunct + { + stop("'cpus' has been replaced by BPPARAM. See documentation.") + } + BPPARAM$RNGseed <- seed + set.seed(seed) + + # hardcode init and scheme + scheme <- 'horst' + init <- 'svd.single' + + ## ----------- checks ----------- + + # check input 'Y' and transformation in a dummy matrix + if (!missing(Y)) + { + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level") + + } else if (!missing(indY)) { + Y = X[[indY]] + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.") + } + + if (nlevels(Y) == 1) + stop("'X[[indY]]' should be a factor with more than one level") + + X = X[-indY] #remove Y from X to pass the arguments simpler to block.splsda + + } else if (missing(indY)) { + stop("Either 'Y' or 'indY' is needed") + + } + ## check using internal #TODO we need to unify the checks + Y.check <- unmap(Y) + Y.check <- matrix(Y.check, nrow = nrow(Y.check), dimnames = list(rownames(X[[1]]), NULL)) + Check.entry.wrapper.mint.block(X = X, Y = Y.check, indY = indY, + ncomp = ncomp, DA=TRUE, + design = design, init = init, scheme = scheme, scale = scale, + near.zero.var = near.zero.var, mode = 'regression', tol = tol, + max.iter = max.iter) + + ## ensure all X blocks are matrices, keeping dimnames + X <- lapply(X, function(z){ + zm <- z + if (!is.matrix(zm)) { + zm <- as.matrix(zm) + dimnames(zm) <- dimnames(z) + } + return(zm) + }) + + #-- dist + dist = match.arg( + dist, + choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), + several.ok = TRUE + ) + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", + call. = FALSE) + + #-- ncomp + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + + #-- validation + choices = c("Mfold", "loo") + validation = choices[pmatch(validation, choices)] + if (is.na(validation)) + stop("'validation' must be either 'Mfold' or 'loo'") + + if (validation == 'loo') + { + if (nrepeat != 1) + message("Leave-One-Out validation does not need to be repeated: 'nrepeat' is set to '1'.") + nrepeat = 1 + } + + #-- measure + measure.input = measure + if (!measure %in% c("overall", "BER")) + stop("'measure' must be 'overall' or 'BER'") + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #-- validation + if (any(is.na(validation)) || length(validation) > 1) + stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) + + ## ----------- Cross-validation ----------- + + block.plsda_res <- block.plsda(X, Y, ncomp = ncomp, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) + perf_res <- perf(block.plsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + +} \ No newline at end of file diff --git a/man/tune.block.plsda.Rd b/man/tune.block.plsda.Rd new file mode 100644 index 00000000..a4e652d9 --- /dev/null +++ b/man/tune.block.plsda.Rd @@ -0,0 +1,196 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.block.plsda.R +\name{tune.block.plsda} +\alias{tune.block.plsda} +\title{Tuning function for block.plsda method (N-integration with Discriminant Analysis)} +\usage{ +tune.block.plsda( + X, + Y, + indY, + ncomp = 2, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + design, + scale = TRUE, + validation = "Mfold", + folds = 10, + nrepeat = 1, + signif.threshold = 0.01, + dist = "max.dist", + measure = "BER", + weighted = TRUE, + progressBar = FALSE, + light.output = TRUE, + BPPARAM = SerialParam(), + seed = NULL, + ... +) +} +\arguments{ +\item{X}{A named list of data sets (called 'blocks') measured on the same +samples. Data in the list should be arranged in matrices, samples x variables, +with samples order matching in all data sets.} + +\item{Y}{a factor or a class vector for the discrete outcome.} + +\item{indY}{To supply if \code{Y} is missing, indicates the position of +the matrix response in the list \code{X}.} + +\item{ncomp}{the number of components to include in the model. Default to 2. +Applies to all blocks.} + +\item{tol}{Positive numeric used as convergence criteria/tolerance during the +iterative process. Default to \code{1e-06}.} + +\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Setting this argument to FALSE (when appropriate) will speed up the +computations. Default value is FALSE.} + +\item{design}{numeric matrix of size (number of blocks in X) x (number of +blocks in X) with values between 0 and 1. Each value indicates the strenght +of the relationship to be modelled between two blocks; a value of 0 +indicates no relationship, 1 is the maximum value. Alternatively, one of +c('null', 'full') indicating a disconnected or fully connected design, +respecively, or a numeric between 0 and 1 which will designate all +off-diagonal elements of a fully connected design (see examples in +\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the +\code{design} matrix is changed to include relationships to \code{Y}.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is +\code{"Mfold"}.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + +\item{dist}{distance metric to estimate the +classification error rate, should be a subset of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} + +\item{measure}{The tuning measure used for different methods. See details.} + +\item{weighted}{tune using either the performance of the Majority vote or +the Weighted vote.} + +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + +\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type +of parallelisation. See examples.} + +\item{seed}{set a number here if you want the function to give reproducible outputs. +Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} + +\item{...}{Optional arguments: +\itemize{ + \item \bold{seed} Integer. Seed number for reproducible parallel code. + Default is \code{NULL}. +} +run in parallel when repeating the cross-validation, which is usually the +most computationally intensive process. If there is excess CPU, the +cross-vaidation is also parallelised on *nix-based OS which support +\code{mclapply}. +Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'.} +} +\value{ +returns: +\item{error.rate}{Prediction error rate for each block of \code{object$X} +and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for +each block of \code{object$X}, each \code{dist} and each class} +\item{predict}{Predicted values of each sample for each class, each block +and each component} \item{class}{Predicted class of each sample for each +block, each \code{dist}, each component and each nrepeat} \item{features}{a +list of features selected across the folds (\code{$stable.X} and +\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the +input object.} \item{AveragedPredict.class}{if more than one block, returns +the average predicted class over the blocks (averaged of the \code{Predict} +output and prediction using the \code{max.dist} distance)} +\item{AveragedPredict.error.rate}{if more than one block, returns the +average predicted error rate over the blocks (using the +\code{AveragedPredict.class} output)} \item{WeightedPredict.class}{if more +than one block, returns the weighted predicted class over the blocks +(weighted average of the \code{Predict} output and prediction using the +\code{max.dist} distance). See details for more info on weights.} +\item{WeightedPredict.error.rate}{if more than one block, returns the +weighted average predicted error rate over the blocks (using the +\code{WeightedPredict.class} output.)} \item{MajorityVote}{if more than one +block, returns the majority class over the blocks. NA for a sample means that +there is no consensus on the predicted class for this particular sample over +the blocks.} \item{MajorityVote.error.rate}{if more than one block, returns +the error rate of the \code{MajorityVote} output} +\item{WeightedVote}{if more than one block, returns the weighted majority +class over the blocks. NA for a sample means that there is no consensus on +the predicted class for this particular sample over the blocks.} +\item{WeightedVote.error.rate}{if more than one block, returns the error +rate of the \code{WeightedVote} output} \item{weights}{Returns the weights +of each block used for the weighted predictions, for each nrepeat and each +fold} \item{choice.ncomp}{For supervised models; returns the optimal number +of components for the model for each prediction distance using one-sided +t-tests that test for a significant difference in the mean error rate (gain +in prediction) when components are added to the model. See more details in +Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is +returned for each prediction framework.} +} +\description{ +Computes M-fold or Leave-One-Out Cross-Validation scores based on a +user-input grid to determine the optimal parameters for +method \code{block.plsda}. +} +\details{ +This tuning function should be used to tune the number of components in the \code{block.plsda} function (N-integration with Discriminant Analysis). + +M-fold or LOO cross-validation is performed with stratified subsampling +where all classes are represented in each fold. + +If \code{validation = "Mfold"}, M-fold cross-validation is performed. The +number of folds to generate is to be specified in the argument \code{folds}. + +If \code{validation = "loo"}, leave-one-out cross-validation is performed. +By default \code{folds} is set to the number of unique individuals. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). Details +about the PLS modes are in \code{?pls}. + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. +} +\references{ +Method: + +Singh A., Gautier B., Shannon C., Vacher M., Rohart F., Tebbutt S. and Lê +Cao K.A. (2016). DIABLO: multi omics integration for biomarker discovery. + +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 +} +\seealso{ +\code{\link{block.splsda}} and http://www.mixOmics.org for more +details. +} +\author{ +Florian Rohart, Amrit Singh, Kim-Anh Lê Cao, AL J Abadi +} +\keyword{multivariate} +\keyword{regression} From 9c2cd77f1c964a8d8566956529ca84f33ee2838d Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:25:36 +1100 Subject: [PATCH 19/75] add example for tune.block.plsda --- examples/tune.block.plsda-examples.R | 47 ++++++++++++++++++++++++++ man/tune.block.plsda.Rd | 49 ++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 examples/tune.block.plsda-examples.R diff --git a/examples/tune.block.plsda-examples.R b/examples/tune.block.plsda-examples.R new file mode 100644 index 00000000..6dae25d5 --- /dev/null +++ b/examples/tune.block.plsda-examples.R @@ -0,0 +1,47 @@ +## Set up data + +# load data +data("breast.TCGA") + +# X data - list of mRNA and miRNA +X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein) + +# Y data - single data set of proteins +Y <- breast.TCGA$data.train$subtype + +# subset the X and Y data to speed up computation in this example +set.seed(100) +subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] +X <- lapply(X, function(omic) omic[subset,]) +Y <- Y[subset] + +# set up a full design where every block is connected +# could also consider other weights, see our mixOmics manuscript +design = matrix(1, ncol = length(data), nrow = length(data), + dimnames = list(names(data), names(data))) +diag(design) = 0 +design + +## Tune number of components to keep +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist", "centroids.dist")) + +plot(tune_res) + +tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist + + +## Tune number of components to keep - use weighted vote rather than majority vote +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist", "centroids.dist"), + weighted = FALSE) +plot(tune_res) + +tune_res$weights \ No newline at end of file diff --git a/man/tune.block.plsda.Rd b/man/tune.block.plsda.Rd index a4e652d9..1c7c5533 100644 --- a/man/tune.block.plsda.Rd +++ b/man/tune.block.plsda.Rd @@ -173,6 +173,55 @@ it calculates the average proportion of wrongly classified samples in each class, weighted by the number of samples in each class. BER is less biased towards majority classes during the performance assessment. } +\examples{ +## Set up data + +# load data +data("breast.TCGA") + +# X data - list of mRNA and miRNA +X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, + protein = breast.TCGA$data.train$protein) + +# Y data - single data set of proteins +Y <- breast.TCGA$data.train$subtype + +# subset the X and Y data to speed up computation in this example +set.seed(100) +subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 3)[[1]][[1]] +X <- lapply(X, function(omic) omic[subset,]) +Y <- Y[subset] + +# set up a full design where every block is connected +# could also consider other weights, see our mixOmics manuscript +design = matrix(1, ncol = length(data), nrow = length(data), + dimnames = list(names(data), names(data))) +diag(design) = 0 +design + +## Tune number of components to keep +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist", "centroids.dist")) + +plot(tune_res) + +tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist + + +## Tune number of components to keep - use weighted vote rather than majority vote +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist", "centroids.dist"), + weighted = FALSE) +plot(tune_res) + +tune_res$weights +} \references{ Method: From 73c89c51d63f666e5ebae88619a0fade6b763aa6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:50:26 +1100 Subject: [PATCH 20/75] update tune.mint.splsda docs and example --- R/tune.mint.splsda.R | 71 +++++++++++++--------------- examples/tune.mint.splsda-examples.R | 30 ++++++++++++ man/tune.mint.splsda.Rd | 52 +++++++------------- 3 files changed, 82 insertions(+), 71 deletions(-) create mode 100644 examples/tune.mint.splsda-examples.R diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 26316c0c..3235c479 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -4,17 +4,14 @@ #' Estimate the parameters of mint.splsda method #' #' Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a -#' user-input grid to determine optimal values for the sparsity parameters in +#' user-input grid to determine optimal values for the parameters in #' \code{mint.splsda}. #' #' This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), -#' where each of \code{study} is left out once. It returns a list of variables -#' of \code{X} that were selected on each of the \code{ncomp} components. Then, -#' a \code{\link{mint.splsda}} can be performed with \code{keepX} set as the -#' output \code{choice.keepX}. +#' where each of \code{study} is left out once. #' -#' All component \eqn{1:\code{ncomp}} are tuned, except the first ones for -#' which a \code{already.tested.X} is provided. See examples below. +#' When \code{test.keepX} is not NULL, all component \eqn{1:\code{ncomp}} are tuned to identify number of variables to keep, +#' except the first ones for which a \code{already.tested.X} is provided. See examples below. #' #' The function outputs the optimal number of components that achieve the best #' performance based on the overall error rate or BER. The assessment is @@ -41,7 +38,7 @@ #' @param study grouping factor indicating which samples are from the same #' study #' @param test.keepX numeric vector for the different number of variables to -#' test from the \eqn{X} data set +#' test from the \eqn{X} data set. If set to NULL only number of components will be tuned. #' @param already.tested.X if \code{ncomp > 1} Numeric vector indicating the #' number of variables to select from the \eqn{X} data set on the firsts #' components @@ -81,6 +78,18 @@ #' \item{predict}{Prediction values for each sample, each \code{test.keepX} and #' each comp.} \item{class}{Predicted class for each sample, each #' \code{test.keepX} and each comp.} +#' +#' If \code{test.keepX = NULL}, returns: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist} and each of the +#' \code{ncomp} components. Directly obtained from the \code{predict} output.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +#' models}. +#' #' @author Florian Rohart, Al J Abadi #' @seealso \code{\link{mint.splsda}} and http://www.mixOmics.org for more #' details. @@ -96,38 +105,14 @@ #' e1005752 #' @keywords multivariate dplot #' @export -#' @examples -#' data(stemcells) -#' data = stemcells$gene -#' type.id = stemcells$celltype -#' exp = stemcells$study -#' -#' res = mint.splsda(X=data,Y=type.id,ncomp=3,keepX=c(10,5,15),study=exp) -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -#' study=exp,test.keepX=seq(1,10,1)) -#' -#' out$choice.ncomp -#' out$choice.keepX -#' -#' \dontrun{ -#' -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -#' study=exp,test.keepX=seq(1,10,1)) -#' out$choice.keepX -#' -#' ## only tune component 2 and keeping 10 genes on comp1 -#' out = tune.mint.splsda(X=data,Y=type.id,ncomp=2, study=exp, -#' already.tested.X = c(10), -#' test.keepX=seq(1,10,1)) -#' out$choice.keepX -#' -#' } +#' @examples ./examples/tune.mint.splsda-examples.R + tune.mint.splsda <- function (X, Y, ncomp = 1, study, - test.keepX = c(5, 10, 15), + test.keepX = NULL, already.tested.X, dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), measure = c("BER", "overall"), @@ -138,7 +123,7 @@ tune.mint.splsda <- max.iter = 100, near.zero.var = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold=0.01 + signif.threshold = 0.01 ) { #-- checking general input parameters --------------------------------------# @@ -238,7 +223,18 @@ tune.mint.splsda <- if (!is.logical(light.output)) stop("'light.output' must be either TRUE or FALSE", call. = FALSE) - #-- test.keepX + #-- test.keepX, if null just tune on ncomp + if (is.null(test.keepX)){ + print("test.keepX is set to NULL, tuning only for number of components...") + mint.splsda_res <- mint.splsda(X, Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var) + perf_res <- perf(mint.splsda_res, + validation = validation, folds = folds, nrepeat = nrepeat, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + + } else { if (is.null(test.keepX) | length(test.keepX) == 1 | !is.numeric(test.keepX)) stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) @@ -362,4 +358,5 @@ tune.mint.splsda <- class(result) = c("tune.mint.splsda","tune.splsda") return(result) + } } diff --git a/examples/tune.mint.splsda-examples.R b/examples/tune.mint.splsda-examples.R new file mode 100644 index 00000000..c8d10a6c --- /dev/null +++ b/examples/tune.mint.splsda-examples.R @@ -0,0 +1,30 @@ +# set up data +data(stemcells) +data <- stemcells$gene +type.id <- stemcells$celltype +exp <- stemcells$study + +# tune number of components +tune_res <- tune.mint.splsda(X = data,Y = type.id, ncomp=5, + near.zero.var=FALSE, + study=exp, + test.keepX = NULL) + +plot(tune_res) +tune_res$choice.ncomp # 1 component + +## tune number of variables to keep +tune_res <- tune.mint.splsda(X = data,Y = type.id, ncomp = 1, + near.zero.var = FALSE, + study=exp, + test.keepX=seq(1,10,1)) + +plot(tune_res) +tune_res$choice.keepX # 9 variables to keep on component 1 + +## only tune component 3 and keeping 10 genes on comp1 +tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 2, study = exp, + already.tested.X = c(9), + test.keepX = seq(1,10,1)) +plot(tune_res) +tune_res$choice.keepX # 10 variables to keep on comp2 \ No newline at end of file diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 05896dd3..c9207558 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -9,7 +9,7 @@ tune.mint.splsda( Y, ncomp = 1, study, - test.keepX = c(5, 10, 15), + test.keepX = NULL, already.tested.X, dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), measure = c("BER", "overall"), @@ -36,7 +36,7 @@ Default to 1} study} \item{test.keepX}{numeric vector for the different number of variables to -test from the \eqn{X} data set} +test from the \eqn{X} data set. If set to NULL only number of components will be tuned.} \item{already.tested.X}{if \code{ncomp > 1} Numeric vector indicating the number of variables to select from the \eqn{X} data set on the firsts @@ -89,21 +89,29 @@ and for each component computed with the optimal keepX} \item{predict}{Prediction values for each sample, each \code{test.keepX} and each comp.} \item{class}{Predicted class for each sample, each \code{test.keepX} and each comp.} + +If \code{test.keepX = NULL}, returns: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} \item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} \item{class}{A list which gives the +predicted class of each sample for each \code{dist} and each of the +\code{ncomp} components. Directly obtained from the \code{predict} output.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +models}. } \description{ Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a -user-input grid to determine optimal values for the sparsity parameters in +user-input grid to determine optimal values for the parameters in \code{mint.splsda}. } \details{ This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), -where each of \code{study} is left out once. It returns a list of variables -of \code{X} that were selected on each of the \code{ncomp} components. Then, -a \code{\link{mint.splsda}} can be performed with \code{keepX} set as the -output \code{choice.keepX}. +where each of \code{study} is left out once. -All component \eqn{1:\code{ncomp}} are tuned, except the first ones for -which a \code{already.tested.X} is provided. See examples below. +When \code{test.keepX} is not NULL, all component \eqn{1:\code{ncomp}} are tuned to identify number of variables to keep, +except the first ones for which a \code{already.tested.X} is provided. See examples below. The function outputs the optimal number of components that achieve the best performance based on the overall error rate or BER. The assessment is @@ -123,31 +131,7 @@ More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). } \examples{ -data(stemcells) -data = stemcells$gene -type.id = stemcells$celltype -exp = stemcells$study - -res = mint.splsda(X=data,Y=type.id,ncomp=3,keepX=c(10,5,15),study=exp) -out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -study=exp,test.keepX=seq(1,10,1)) - -out$choice.ncomp -out$choice.keepX - -\dontrun{ - -out = tune.mint.splsda(X=data,Y=type.id,ncomp=2,near.zero.var=FALSE, -study=exp,test.keepX=seq(1,10,1)) -out$choice.keepX - -## only tune component 2 and keeping 10 genes on comp1 -out = tune.mint.splsda(X=data,Y=type.id,ncomp=2, study=exp, -already.tested.X = c(10), -test.keepX=seq(1,10,1)) -out$choice.keepX - -} +./examples/tune.mint.splsda-examples.R } \references{ Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). From 1075f64d46d330541cc48d7bd2b483824ca53650 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:00:48 +1100 Subject: [PATCH 21/75] add examples to docs --- R/tune.mint.splsda.R | 2 +- man/tune.mint.splsda.Rd | 31 ++++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 3235c479..0c9e137d 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -105,7 +105,7 @@ #' e1005752 #' @keywords multivariate dplot #' @export -#' @examples ./examples/tune.mint.splsda-examples.R +#' @example ./examples/tune.mint.splsda-examples.R tune.mint.splsda <- function (X, diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index c9207558..566b3e36 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -131,7 +131,36 @@ More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). } \examples{ -./examples/tune.mint.splsda-examples.R +# set up data +data(stemcells) +data <- stemcells$gene +type.id <- stemcells$celltype +exp <- stemcells$study + +# tune number of components +tune_res <- tune.mint.splsda(X = data,Y = type.id, ncomp=5, + near.zero.var=FALSE, + study=exp, + test.keepX = NULL) + +plot(tune_res) +tune_res$choice.ncomp # 1 component + +## tune number of variables to keep +tune_res <- tune.mint.splsda(X = data,Y = type.id, ncomp = 1, + near.zero.var = FALSE, + study=exp, + test.keepX=seq(1,10,1)) + +plot(tune_res) +tune_res$choice.keepX # 9 variables to keep on component 1 + +## only tune component 3 and keeping 10 genes on comp1 +tune_res <- tune.mint.splsda(X = data, Y = type.id, ncomp = 2, study = exp, + already.tested.X = c(9), + test.keepX = seq(1,10,1)) +plot(tune_res) +tune_res$choice.keepX # 10 variables to keep on comp2 } \references{ Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). From c698b2ebd63c5928718fdea1ef95d764bdb0b265 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:06:26 +1100 Subject: [PATCH 22/75] add unit tests for tune.mint.splsda --- tests/testthat/test-tune.mint.splsda.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/testthat/test-tune.mint.splsda.R b/tests/testthat/test-tune.mint.splsda.R index 23858a20..68b826b2 100644 --- a/tests/testthat/test-tune.mint.splsda.R +++ b/tests/testthat/test-tune.mint.splsda.R @@ -44,3 +44,27 @@ test_that("tune.mint.splsda works with custom alpha", code = { expect_equal(out$choice.ncomp$ncomp, 1) }) + +test_that("tune.mint.splsda works when test.keepX = NULL and gives same result as perf()", code = { + + # set up data + data(stemcells) + X = stemcells$gene + Y = stemcells$celltype + study = stemcells$study + + # tune on components only + tune_res <- suppressWarnings( + tune.mint.splsda(X, Y, study = study, ncomp = 2, + test.keepX = NULL) + ) + + # run perf + mint.splsda_res <- mint.splsda(X, Y, study = study, ncomp = 2) + perf_res <- suppressWarnings( + perf(mint.splsda_res, ncomp = 2, dist = "max.dist") + ) + + # check results match + expect_equal(tune_res$global.error$BER[1,1], perf_res$global.error$BER[1,1]) +}) From 72b15f25ac0eee88f3f15edb10e8cc34b26bb0f5 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:10:40 +1100 Subject: [PATCH 23/75] no CV params in mint --- R/tune.mint.splsda.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 0c9e137d..0f0e3246 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -229,7 +229,6 @@ tune.mint.splsda <- mint.splsda_res <- mint.splsda(X, Y, ncomp = ncomp, study = study, scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var) perf_res <- perf(mint.splsda_res, - validation = validation, folds = folds, nrepeat = nrepeat, dist = dist, BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) return(perf_res) From 5de7a1bffe53f45ebf0065699c643d6ed5711aa6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:10:48 +1100 Subject: [PATCH 24/75] init tune.mint.plsda --- R/tune.mint.plsda.R | 194 ++++++++++++++++++++++++++++++++++++++++ man/tune.mint.splsda.Rd | 72 ++++++++++++++- 2 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 R/tune.mint.plsda.R diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R new file mode 100644 index 00000000..575b8566 --- /dev/null +++ b/R/tune.mint.plsda.R @@ -0,0 +1,194 @@ +# ======================================================================================================== +# tune.mint.plsda: chose the optimal number of parameters per component on a mint.plsda method +# ======================================================================================================== +#' Estimate the parameters of mint.plsda method +#' +#' Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a +#' user-input grid to determine optimal values for the parameters in +#' \code{mint.plsda}. +#' +#' This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), +#' where each of \code{study} is left out once. +#' +#' The function outputs the optimal number of components that achieve the best +#' performance based on the overall error rate or BER. The assessment is +#' data-driven and similar to the process detailed in (Rohart et al., 2016), +#' where one-sided t-tests assess whether there is a gain in performance when +#' adding a component to the model. Our experience has shown that in most case, +#' the optimal number of components is the number of categories in \code{Y} - +#' 1, but it is worth tuning a few extra components to check (see our website +#' and case studies for more details). +#' +#' BER is appropriate in case of an unbalanced number of samples per class as +#' it calculates the average proportion of wrongly classified samples in each +#' class, weighted by the number of samples in each class. BER is less biased +#' towards majority classes during the performance assessment. +#' +#' More details about the prediction distances in \code{?predict} and the +#' supplemental material of the mixOmics article (Rohart et al. 2017). +#' +#' @param X numeric matrix of predictors. \code{NA}s are allowed. +#' @param Y Outcome. Numeric vector or matrix of responses (for multi-response +#' models) +#' @param ncomp Number of components to include in the model (see Details). +#' Default to 1 +#' @param study grouping factor indicating which samples are from the same +#' study +#' @param dist only applies to an object inheriting from \code{"plsda"} or +#' \code{"splsda"} to evaluate the classification performance of the model. +#' Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"}. Default is \code{"all"}. See +#' \code{\link{predict}}. +#' @param measure Two misclassification measure are available: overall +#' misclassification error \code{overall} or the Balanced Error Rate \code{BER} +#' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) +#' performance of the model. +#' @param progressBar by default set to \code{TRUE} to output the progress bar +#' of the computation. +#' @param scale Logical. If scale = TRUE, each block is standardized to zero +#' means and unit variances (default: TRUE) +#' @param tol Convergence stopping value. +#' @param max.iter integer, the maximum number of iterations. +#' @param near.zero.var Logical, see the internal \code{\link{nearZeroVar}} +#' function (should be set to TRUE in particular for data with many zero +#' values). Default value is FALSE +#' @param light.output if set to FALSE, the prediction/classification of each +#' sample for each of \code{test.keepX} and each comp is returned. +#' @param signif.threshold numeric between 0 and 1 indicating the significance +#' threshold required for improvement in error rate of the components. Default +#' to 0.01. +#' @return The returned value is a list with components: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist} and each of the +#' \code{ncomp} components. Directly obtained from the \code{predict} output.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +#' models}. +#' +#' @author Florian Rohart, Al J Abadi +#' @seealso \code{\link{mint.plsda}} and http://www.mixOmics.org for more +#' details. +#' @references Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). +#' MINT: A multivariate integrative approach to identify a reproducible +#' biomarker signature across multiple experiments and platforms. BMC +#' Bioinformatics 18:128. +#' +#' mixOmics article: +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' @keywords multivariate dplot +#' @export +#' @example ./examples/tune.mint.plsda-examples.R + +tune.mint.splsda <- + function (X, + Y, + ncomp = 1, + study, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + scale = TRUE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + signif.threshold = 0.01 + ) + { + #-- checking general input parameters --------------------------------------# + #---------------------------------------------------------------------------# + + #------------------# + #-- check entries --# + if(missing(X)) + stop("'X'is missing", call. = FALSE) + + X = as.matrix(X) + + if (length(dim(X)) != 2 || !is.numeric(X)) + stop("'X' must be a numeric matrix.", call. = FALSE) + + + # Testing the input Y + if(missing(Y)) + stop("'Y'is missing", call. = FALSE) + if (is.null(Y)) + stop("'Y' has to be something else than NULL.", call. = FALSE) + + if (is.null(dim(Y))) + { + Y = factor(Y) + } else { + stop("'Y' should be a factor or a class vector.", call. = FALSE) + } + + if (nlevels(Y) == 1) + stop("'Y' should be a factor with more than one level", call. = FALSE) + + #-- check significance threshold + signif.threshold <- .check_alpha(signif.threshold) + + #-- progressBar + if (!is.logical(progressBar)) + stop("'progressBar' must be a logical constant (TRUE or FALSE).", call. = FALSE) + + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop("invalid number of variates, 'ncomp'.") + + + #-- measure + measure <- match.arg(measure) + + # -- check using the check of mint.splsda + Y.mat = unmap(Y) + colnames(Y.mat) = levels(Y) + + check = Check.entry.pls(X, Y = Y.mat, ncomp = ncomp, mode="regression", scale=scale, + near.zero.var=near.zero.var, max.iter=max.iter ,tol=tol ,logratio="none" ,DA=TRUE, multilevel=NULL) + X = check$X + ncomp = check$ncomp + + # -- study + #set the default study factor + if (missing(study)) + stop("'study' is missing", call. = FALSE) + + if (length(study) != nrow(X)) + stop(paste0("'study' must be a factor of length ",nrow(X),".")) + + if (any(table(study) <= 1)) + stop("At least one study has only one sample, please consider removing before calling the function again", call. = FALSE) + if (any(table(study) < 5)) + warning("At least one study has less than 5 samples, mean centering might not do as expected") + + if(sum(apply(table(Y,study)!=0,2,sum)==1) >0) + stop("At least one study only contains a single level of the multi-levels outcome Y. The MINT algorithm cannot be computed.") + + if(sum(apply(table(Y,study)==0,2,sum)>0) >0) + warning("At least one study does not contain all the levels of the outcome Y. The MINT algorithm might not perform as expected.") + + + #-- dist + dist = match.arg(dist) + + #-- light.output + if (!is.logical(light.output)) + stop("'light.output' must be either TRUE or FALSE", call. = FALSE) + + #-- run perf to tune ncomp --------------------------------------# + #---------------------------------------------------------------------------# + mint.plsda_res <- mint.plsda(X, Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var) + perf_res <- perf(mint.plsda_res, + dist = dist, + BPPARAM = BPPARAM, seed = seed, progressBar = progressBar) + return(perf_res) + } \ No newline at end of file diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 566b3e36..8ec0f658 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -1,9 +1,28 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tune.mint.splsda.R +% Please edit documentation in R/tune.mint.plsda.R, R/tune.mint.splsda.R \name{tune.mint.splsda} \alias{tune.mint.splsda} -\title{Estimate the parameters of mint.splsda method} +\title{Estimate the parameters of mint.plsda method} \usage{ +tune.mint.splsda( + X, + Y, + ncomp = 1, + study, + test.keepX = NULL, + already.tested.X, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + scale = TRUE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + light.output = TRUE, + signif.threshold = 0.01 +) + tune.mint.splsda( X, Y, @@ -76,6 +95,17 @@ threshold required for improvement in error rate of the components. Default to 0.01.} } \value{ +The returned value is a list with components: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} \item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} \item{class}{A list which gives the +predicted class of each sample for each \code{dist} and each of the +\code{ncomp} components. Directly obtained from the \code{predict} output.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +models}. + The returned value is a list with components: \item{error.rate}{returns the prediction error for each \code{test.keepX} on each component, averaged across all repeats and subsampling folds. Standard @@ -102,6 +132,10 @@ predicted class of each sample for each \code{dist} and each of the models}. } \description{ +Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a +user-input grid to determine optimal values for the parameters in +\code{mint.plsda}. + Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a user-input grid to determine optimal values for the parameters in \code{mint.splsda}. @@ -110,6 +144,26 @@ user-input grid to determine optimal values for the parameters in This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), where each of \code{study} is left out once. +The function outputs the optimal number of components that achieve the best +performance based on the overall error rate or BER. The assessment is +data-driven and similar to the process detailed in (Rohart et al., 2016), +where one-sided t-tests assess whether there is a gain in performance when +adding a component to the model. Our experience has shown that in most case, +the optimal number of components is the number of categories in \code{Y} - +1, but it is worth tuning a few extra components to check (see our website +and case studies for more details). + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). + +This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), +where each of \code{study} is left out once. + When \code{test.keepX} is not NULL, all component \eqn{1:\code{ncomp}} are tuned to identify number of variables to keep, except the first ones for which a \code{already.tested.X} is provided. See examples below. @@ -170,11 +224,25 @@ Bioinformatics 18:128. mixOmics article: +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 + +Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). +MINT: A multivariate integrative approach to identify a reproducible +biomarker signature across multiple experiments and platforms. BMC +Bioinformatics 18:128. + +mixOmics article: + Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics feature selection and multiple data integration. PLoS Comput Biol 13(11): e1005752 } \seealso{ +\code{\link{mint.plsda}} and http://www.mixOmics.org for more +details. + \code{\link{mint.splsda}} and http://www.mixOmics.org for more details. } From bddda870a5309ff01b5e35c955b6909e76f37c15 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:11:57 +1100 Subject: [PATCH 25/75] init tune.mint.plsda --- NAMESPACE | 1 + R/tune.mint.plsda.R | 2 +- man/tune.mint.plsda.Rd | 127 ++++++++++++++++++++++++++++++++++++++++ man/tune.mint.splsda.Rd | 72 +---------------------- 4 files changed, 131 insertions(+), 71 deletions(-) create mode 100644 man/tune.mint.plsda.Rd diff --git a/NAMESPACE b/NAMESPACE index 8391cdc6..24005b72 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -171,6 +171,7 @@ export(study_split) export(tune) export(tune.block.plsda) export(tune.block.splsda) +export(tune.mint.plsda) export(tune.mint.splsda) export(tune.pca) export(tune.pls) diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index 575b8566..48af7631 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -85,7 +85,7 @@ #' @export #' @example ./examples/tune.mint.plsda-examples.R -tune.mint.splsda <- +tune.mint.plsda <- function (X, Y, ncomp = 1, diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd new file mode 100644 index 00000000..c73c9c63 --- /dev/null +++ b/man/tune.mint.plsda.Rd @@ -0,0 +1,127 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tune.mint.plsda.R +\name{tune.mint.plsda} +\alias{tune.mint.plsda} +\title{Estimate the parameters of mint.plsda method} +\usage{ +tune.mint.plsda( + X, + Y, + ncomp = 1, + study, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + scale = TRUE, + tol = 1e-06, + max.iter = 100, + near.zero.var = FALSE, + light.output = TRUE, + signif.threshold = 0.01 +) +} +\arguments{ +\item{X}{numeric matrix of predictors. \code{NA}s are allowed.} + +\item{Y}{Outcome. Numeric vector or matrix of responses (for multi-response +models)} + +\item{ncomp}{Number of components to include in the model (see Details). +Default to 1} + +\item{study}{grouping factor indicating which samples are from the same +study} + +\item{dist}{only applies to an object inheriting from \code{"plsda"} or +\code{"splsda"} to evaluate the classification performance of the model. +Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, +\code{"mahalanobis.dist"}. Default is \code{"all"}. See +\code{\link{predict}}.} + +\item{measure}{Two misclassification measure are available: overall +misclassification error \code{overall} or the Balanced Error Rate \code{BER}} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model.} + +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} +} +\value{ +The returned value is a list with components: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} \item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} \item{class}{A list which gives the +predicted class of each sample for each \code{dist} and each of the +\code{ncomp} components. Directly obtained from the \code{predict} output.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint +models}. +} +\description{ +Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a +user-input grid to determine optimal values for the parameters in +\code{mint.plsda}. +} +\details{ +This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), +where each of \code{study} is left out once. + +The function outputs the optimal number of components that achieve the best +performance based on the overall error rate or BER. The assessment is +data-driven and similar to the process detailed in (Rohart et al., 2016), +where one-sided t-tests assess whether there is a gain in performance when +adding a component to the model. Our experience has shown that in most case, +the optimal number of components is the number of categories in \code{Y} - +1, but it is worth tuning a few extra components to check (see our website +and case studies for more details). + +BER is appropriate in case of an unbalanced number of samples per class as +it calculates the average proportion of wrongly classified samples in each +class, weighted by the number of samples in each class. BER is less biased +towards majority classes during the performance assessment. + +More details about the prediction distances in \code{?predict} and the +supplemental material of the mixOmics article (Rohart et al. 2017). +} +\references{ +Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). +MINT: A multivariate integrative approach to identify a reproducible +biomarker signature across multiple experiments and platforms. BMC +Bioinformatics 18:128. + +mixOmics article: + +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 +} +\seealso{ +\code{\link{mint.plsda}} and http://www.mixOmics.org for more +details. +} +\author{ +Florian Rohart, Al J Abadi +} +\keyword{dplot} +\keyword{multivariate} diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 8ec0f658..566b3e36 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -1,28 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tune.mint.plsda.R, R/tune.mint.splsda.R +% Please edit documentation in R/tune.mint.splsda.R \name{tune.mint.splsda} \alias{tune.mint.splsda} -\title{Estimate the parameters of mint.plsda method} +\title{Estimate the parameters of mint.splsda method} \usage{ -tune.mint.splsda( - X, - Y, - ncomp = 1, - study, - test.keepX = NULL, - already.tested.X, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, - scale = TRUE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - light.output = TRUE, - signif.threshold = 0.01 -) - tune.mint.splsda( X, Y, @@ -95,17 +76,6 @@ threshold required for improvement in error rate of the components. Default to 0.01.} } \value{ -The returned value is a list with components: -\item{study.specific.error}{A list that gives BER, overall error rate and -error rate per class, for each study} \item{global.error}{A list that gives -BER, overall error rate and error rate per class for all samples} -\item{predict}{A list of length \code{ncomp} that produces the predicted -values of each sample for each class} \item{class}{A list which gives the -predicted class of each sample for each \code{dist} and each of the -\code{ncomp} components. Directly obtained from the \code{predict} output.} -\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint -models}. - The returned value is a list with components: \item{error.rate}{returns the prediction error for each \code{test.keepX} on each component, averaged across all repeats and subsampling folds. Standard @@ -132,10 +102,6 @@ predicted class of each sample for each \code{dist} and each of the models}. } \description{ -Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a -user-input grid to determine optimal values for the parameters in -\code{mint.plsda}. - Computes Leave-One-Group-Out-Cross-Validation (LOGOCV) scores on a user-input grid to determine optimal values for the parameters in \code{mint.splsda}. @@ -144,26 +110,6 @@ user-input grid to determine optimal values for the parameters in This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), where each of \code{study} is left out once. -The function outputs the optimal number of components that achieve the best -performance based on the overall error rate or BER. The assessment is -data-driven and similar to the process detailed in (Rohart et al., 2016), -where one-sided t-tests assess whether there is a gain in performance when -adding a component to the model. Our experience has shown that in most case, -the optimal number of components is the number of categories in \code{Y} - -1, but it is worth tuning a few extra components to check (see our website -and case studies for more details). - -BER is appropriate in case of an unbalanced number of samples per class as -it calculates the average proportion of wrongly classified samples in each -class, weighted by the number of samples in each class. BER is less biased -towards majority classes during the performance assessment. - -More details about the prediction distances in \code{?predict} and the -supplemental material of the mixOmics article (Rohart et al. 2017). - -This function performs a Leave-One-Group-Out-Cross-Validation (LOGOCV), -where each of \code{study} is left out once. - When \code{test.keepX} is not NULL, all component \eqn{1:\code{ncomp}} are tuned to identify number of variables to keep, except the first ones for which a \code{already.tested.X} is provided. See examples below. @@ -224,25 +170,11 @@ Bioinformatics 18:128. mixOmics article: -Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics -feature selection and multiple data integration. PLoS Comput Biol 13(11): -e1005752 - -Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). -MINT: A multivariate integrative approach to identify a reproducible -biomarker signature across multiple experiments and platforms. BMC -Bioinformatics 18:128. - -mixOmics article: - Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics feature selection and multiple data integration. PLoS Comput Biol 13(11): e1005752 } \seealso{ -\code{\link{mint.plsda}} and http://www.mixOmics.org for more -details. - \code{\link{mint.splsda}} and http://www.mixOmics.org for more details. } From b4a6e718c22d9cdf42f8c27ffaab03b497cbc805 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:14:13 +1100 Subject: [PATCH 26/75] update examples of tune.mint.plsda --- examples/tune.mint.plsda-examples.R | 13 +++++++++++++ man/tune.mint.plsda.Rd | 15 +++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 examples/tune.mint.plsda-examples.R diff --git a/examples/tune.mint.plsda-examples.R b/examples/tune.mint.plsda-examples.R new file mode 100644 index 00000000..2c08350d --- /dev/null +++ b/examples/tune.mint.plsda-examples.R @@ -0,0 +1,13 @@ +# set up data +data(stemcells) +data <- stemcells$gene +type.id <- stemcells$celltype +exp <- stemcells$study + +# tune number of components +tune_res <- tune.mint.plsda(X = data,Y = type.id, ncomp=5, + near.zero.var=FALSE, + study=exp) + +plot(tune_res) +tune_res$choice.ncomp # 1 component \ No newline at end of file diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd index c73c9c63..24344086 100644 --- a/man/tune.mint.plsda.Rd +++ b/man/tune.mint.plsda.Rd @@ -104,6 +104,21 @@ towards majority classes during the performance assessment. More details about the prediction distances in \code{?predict} and the supplemental material of the mixOmics article (Rohart et al. 2017). } +\examples{ +# set up data +data(stemcells) +data <- stemcells$gene +type.id <- stemcells$celltype +exp <- stemcells$study + +# tune number of components +tune_res <- tune.mint.plsda(X = data,Y = type.id, ncomp=5, + near.zero.var=FALSE, + study=exp) + +plot(tune_res) +tune_res$choice.ncomp # 1 component +} \references{ Rohart F, Eslami A, Matigian, N, Bougeard S, Lê Cao K-A (2017). MINT: A multivariate integrative approach to identify a reproducible From 4ed248f58613e02ef41c79c5a1dbb9bf216e1a27 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:55:20 +1100 Subject: [PATCH 27/75] update tune.rcc to remove plot arg to streamline --- R/tune.block.plsda.R | 7 ------- R/tune.block.splsda.R | 19 ++++++++----------- R/tune.rcc.R | 16 ++-------------- examples/tune.rcc-examples.R | 10 ++++++++++ man/tune.rcc.Rd | 11 +++++------ 5 files changed, 25 insertions(+), 38 deletions(-) create mode 100644 examples/tune.rcc-examples.R diff --git a/R/tune.block.plsda.R b/R/tune.block.plsda.R index 230dd18a..aea416f9 100644 --- a/R/tune.block.plsda.R +++ b/R/tune.block.plsda.R @@ -189,13 +189,6 @@ tune.block.plsda <- } return(zm) }) - - #-- dist - dist = match.arg( - dist, - choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), - several.ok = TRUE - ) #-- progressBar if (!is.logical(progressBar)) diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index 978579ce..f7d72380 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -225,13 +225,6 @@ tune.block.splsda <- } return(zm) }) - - #-- dist - dist = match.arg( - dist, - choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), - several.ok = TRUE - ) #-- progressBar if (!is.logical(progressBar)) @@ -242,7 +235,6 @@ tune.block.splsda <- if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) stop("invalid number of variates, 'ncomp'.") - #-- validation choices = c("Mfold", "loo") validation = choices[pmatch(validation, choices)] @@ -265,7 +257,6 @@ tune.block.splsda <- signif.threshold <- .check_alpha(signif.threshold) #-- already.tested.X - if (missing(already.tested.X)) { already.tested.X = NULL @@ -298,7 +289,7 @@ tune.block.splsda <- #-- test.keepX if (is.null(test.keepX)){ print("test.keepX is set to NULL, tuning only for number of components...") - block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, keepX = setNames(as.list(rep(nrow(X[[1]]), length(names(X)))), names(X)), + block.splsda_res <- block.splsda(X, Y, ncomp = ncomp, scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, design = design) perf_res <- perf(block.splsda_res, validation = validation, folds = folds, nrepeat = nrepeat, @@ -307,7 +298,13 @@ tune.block.splsda <- return(perf_res) } else { - + + #-- dist (for tune can only have one dist, for perf can have multiple) + dist = match.arg( + dist, + choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), + several.ok = TRUE + ) if (missing(test.keepX)) { diff --git a/R/tune.rcc.R b/R/tune.rcc.R index a3eaa5c9..2dc5fb77 100644 --- a/R/tune.rcc.R +++ b/R/tune.rcc.R @@ -33,8 +33,6 @@ library(BiocParallel) #' \code{"Mfolds"} (M-folds). See Details. #' @param folds positive integer. Number of folds to use if #' \code{validation="Mfold"}. Defaults to \code{folds=10}. -#' @param plot logical argument indicating whether an image map should be -#' plotted by calling the \code{imgCV} function. #' @param BPPARAM a BiocParallel parameter object; see \code{BiocParallel::bpparam} #' for details. Default is \code{MulticoreParam()} for parallel processing. #' @param seed set a number here if you want the function to give reproducible outputs. @@ -50,14 +48,8 @@ library(BiocParallel) #' details. #' @keywords multivariate dplot #' @export -#' @examples -#' data(nutrimouse) -#' X <- nutrimouse$lipid -#' Y <- nutrimouse$gene -#' -#' ## this can take some seconds -#' -#' tune.rcc(X, Y, validation = "Mfold") +#' @example ./examples/tune.rcc-examples.R + tune.rcc <- function(X, Y, @@ -65,7 +57,6 @@ tune.rcc <- grid2 = seq(0.001, 1, length = 5), validation = c("loo", "Mfold"), folds = 10, - plot = TRUE, BPPARAM = SerialParam(), seed = NULL) { @@ -119,9 +110,6 @@ tune.rcc <- cv.score.grid = cbind(grid, cv.score) mat = matrix(cv.score, nrow = length(grid1), ncol = length(grid2)) - if (isTRUE(plot)) - image.tune.rcc(list(grid1 = grid1, grid2 = grid2, mat = mat)) - opt = cv.score.grid[cv.score.grid[, 3] == max(cv.score.grid[, 3]), ] out = list(opt.lambda1 = opt[[1]], opt.lambda2 = opt[[2]], diff --git a/examples/tune.rcc-examples.R b/examples/tune.rcc-examples.R new file mode 100644 index 00000000..479450bd --- /dev/null +++ b/examples/tune.rcc-examples.R @@ -0,0 +1,10 @@ +#load data +data(nutrimouse) +X <- nutrimouse$lipid +Y <- nutrimouse$gene + +# run tuning +tune_res <- tune.rcc(X, Y, validation = "Mfold") + +# plot output +plot(tune_res) \ No newline at end of file diff --git a/man/tune.rcc.Rd b/man/tune.rcc.Rd index 347342dc..6f43fd7b 100644 --- a/man/tune.rcc.Rd +++ b/man/tune.rcc.Rd @@ -11,7 +11,6 @@ tune.rcc( grid2 = seq(0.001, 1, length = 5), validation = c("loo", "Mfold"), folds = 10, - plot = TRUE, BPPARAM = SerialParam(), seed = NULL ) @@ -34,9 +33,6 @@ method to use, (partially) matching one of \code{"loo"} (leave-one-out) or \item{folds}{positive integer. Number of folds to use if \code{validation="Mfold"}. Defaults to \code{folds=10}.} -\item{plot}{logical argument indicating whether an image map should be -plotted by calling the \code{imgCV} function.} - \item{BPPARAM}{a BiocParallel parameter object; see \code{BiocParallel::bpparam} for details. Default is \code{MulticoreParam()} for parallel processing.} @@ -73,13 +69,16 @@ of the data matrix using the \code{nipals} function. Otherwise, missing values are handled by casewise deletion in the \code{rcc} function. } \examples{ +#load data data(nutrimouse) X <- nutrimouse$lipid Y <- nutrimouse$gene -## this can take some seconds +# run tuning +tune_res <- tune.rcc(X, Y, validation = "Mfold") -tune.rcc(X, Y, validation = "Mfold") +# plot output +plot(tune_res) } \seealso{ \code{\link{image.tune.rcc}} and http://www.mixOmics.org for more From 71710b493a28d9384136f92eee84f6acd7197a35 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:02:56 +1100 Subject: [PATCH 28/75] start streamlining tune wrapper function --- R/tune.R | 305 ++++++++++++++++++++++++---------------------------- man/tune.Rd | 137 ++++++++++++----------- 2 files changed, 210 insertions(+), 232 deletions(-) diff --git a/R/tune.R b/R/tune.R index 13bb1dc9..1ebfde2f 100644 --- a/R/tune.R +++ b/R/tune.R @@ -82,8 +82,6 @@ #' threshold required for improvement in error rate of the components. Default #' to 0.01. #' @param V Matrix used in the logratio transformation id provided (for tune.pca) -#' @param plot logical argument indicating whether an image map should be -#' plotted by calling the \code{imgCV} function. (for tune.rcc) #' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating the type #' of parallelisation. See examples. #' @param seed set a number here if you want the function to give reproducible outputs. @@ -165,202 +163,136 @@ #' @export #' @example ./examples/tune-examples.R tune <- - function (method = c("spls", "splsda", "block.splsda", "mint.splsda", "rcc", "pca", "spca"), + function (method = c("pls", "spls", "plsda", "splsda", "block.plsda", "block.splsda", "mint.plsda", "mint.splsda", "rcc", "pca", "spca"), # Input data X, Y, + # Sparse params test.keepX = c(5, 10, 15), test.keepY = NULL, already.tested.X, already.tested.Y, # Number of components ncomp, - # PLS mode - mode = c("regression", "canonical", "invariant", "classic"), - # How to do the cross-validation + + ## Params related to model building + V, # PCA + center = TRUE, # PCA + grid1 = seq(0.001, 1, length = 5), # CCA + grid2 = seq(0.001, 1, length = 5), # CCA + mode = c("regression", "canonical", "invariant", "classic"), # PLS + indY, # block PLSDA + weighted = TRUE, # block PLSDA + design, # block PLSDA + study, # MINT PLSDA + tol = 1e-09, # PLSDA, block PLSDA, mint PLSDA, PCA + scale = TRUE, # PLS, PLSDA, block PLSDA, mint PLSDA, PCA + logratio = c('none','CLR'), # PLS, PLSDA, PCA + near.zero.var = FALSE, # PLS, PLSDA, block PLSDA, mint PLSDA + max.iter = 100, # PLS, PLSDA, block PLSDA, mint PLSDA, PCA + multilevel = NULL, # PLS, PLSDA, PCA + + ## Params related to cross-validation + validation = "Mfold", nrepeat = 1, folds = 10, - validation = "Mfold", - max.iter = 100, - tol = 1e-09, signif.threshold = 0.01, - # How to transform data - logratio = c('none','CLR'), - V, - center = TRUE, - scale = TRUE, - near.zero.var = FALSE, - # How to measure accuracy + + # How to measure accuracy - only for DA methods dist = "max.dist", measure = ifelse(method == "spls", "cor", "BER"), - # Multilevel - multilevel = NULL, + auc = FALSE, + # Running params seed = NULL, BPPARAM = SerialParam(), progressBar = FALSE, + # Output params - auc = FALSE, - light.output = TRUE, - plot = FALSE, - # Multiblock specific params - indY, - weighted = TRUE, - design, - # CCA specific params - grid1 = seq(0.001, 1, length = 5), - grid2 = seq(0.001, 1, length = 5), - # MINT specific params - study + light.output = TRUE + ) { + + ## ----------- checks ----------- ## + method = match.arg(method) mode <- match.arg(mode) # hardcode scheme and init scheme = "horst" init = "svd.single" - - if (method == "mint.splsda") { - message("Calling 'tune.mint.splsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") - - if (missing(ncomp)) - ncomp = 1 - - result = tune.mint.splsda(X = X, Y = Y, - ncomp = ncomp, - study = study, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - dist = dist, - measure = measure, - auc = auc, - progressBar = progressBar, - scale = scale, - tol = tol, - max.iter = max.iter, - near.zero.var = near.zero.var, - light.output = light.output) - - } else if (method == "block.splsda") { - message("Calling 'tune.block.splsda'") - - result = tune.block.splsda(X = X, - Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - validation = validation, - folds = folds, - dist = dist, - measure = measure, - progressBar = progressBar, - scale = scale, - tol = tol, - max.iter = max.iter, - near.zero.var = near.zero.var, - light.output = light.output, - nrepeat = nrepeat, - BPPARAM = BPPARAM, - indY = indY, - weighted = weighted, - design = design, - scheme = scheme, - init = init, - signif.threshold = signif.threshold, - seed = seed - ) - - - } else if (method == "rcc") { - message("Calling 'tune.rcc'") - - result = tune.rcc(X = X, - Y = Y, - grid1 = grid1, - grid2 = grid2, - validation = validation, - folds = folds, - plot = plot, - BPPARAM = BPPARAM, - seed = seed) - - } else if (method == "pca") { + + + ## ----------- PCA ----------- ## + + if (method == "pca") { message("Calling 'tune.pca'") if (missing(ncomp)) ncomp = NULL - result = tune.pca(X = X, - ncomp = ncomp, - center = center, - scale = scale, - max.iter = max.iter, - tol = tol) - - + result = tune.pca( + # model building params + X = X, ncomp = ncomp, + center = center, scale = scale, max.iter = max.iter, tol = tol) + + ## ----------- sPCA ----------- ## + } else if (method == "spca") { message("Calling 'tune.spca'") if (missing(ncomp)) ncomp = 2 #TODO use match.call() arg matching for these function calls - result = tune.spca(X = X, - ncomp = ncomp, - nrepeat = nrepeat, - folds = folds, - test.keepX = test.keepX, - center = center, - scale = scale, - BPPARAM = BPPARAM, - seed = seed) - - - } else if (method == "splsda") { - - message("Calling 'tune.splsda'") - - if (missing(ncomp)) - ncomp = 1 - - result = tune.splsda (X = X, Y = Y, - ncomp = ncomp, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - validation = validation, - folds = folds, - dist = dist , - measure = measure, - auc = auc, - progressBar = progressBar, - max.iter = max.iter, - near.zero.var = near.zero.var, - nrepeat = nrepeat, - logratio = logratio, - multilevel = multilevel, - light.output = light.output, - seed = seed) + result = tune.spca( + # model building params + X = X, ncomp = ncomp, + center = center, scale = scale, + # sparsity params + test.keepX = test.keepX, + # CV params + nrepeat = nrepeat, folds = folds, + # running params + BPPARAM = BPPARAM, seed = seed) + + + ## ----------- CCA ----------- ## + + } else if (method == "rcc") { + message("Calling 'tune.rcc'") + result = tune.rcc( + # model building params + X = X, Y = Y, + # sparsity params + grid1 = grid1, grid2 = grid2, + # CV params + validation = validation, folds = folds, + # running params + BPPARAM = BPPARAM, seed = seed) + + ## ----------- sPLS +/- multilevel ----------- ## + } else if (method == "spls") { if(missing(multilevel)) { message("Calling 'tune.spls'") - result = tune.spls(X = X, - Y = Y, - test.keepX = test.keepX, - test.keepY = test.keepY, - ncomp = ncomp, - validation = validation, - nrepeat = nrepeat, - folds = folds, - mode = mode, - measure = measure, - BPPARAM = BPPARAM, - progressBar = progressBar, - seed = seed - ) - } else { + result = tune.spls( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + # sparsity params + test.keepX = test.keepX, test.keepY = test.keepY, + # CV params + validation = validation, nrepeat = nrepeat, folds = folds, + # PA params + measure = measure, + # running params + progressBar = progressBar, + BPPARAM = BPPARAM, seed = seed) + + } else { message("Calling 'tune.splslevel' with method = 'spls'") if (missing(ncomp)) @@ -370,15 +302,64 @@ tune <- if (missing(already.tested.Y)) already.tested.Y = NULL - result = tune.splslevel(X = X, Y = Y, - multilevel = multilevel, - mode = mode, - ncomp = ncomp, test.keepX = test.keepX, test.keepY = test.keepY, - already.tested.X = already.tested.X, already.tested.Y = already.tested.Y, - BPPARAM = BPPARAM, seed = seed) + result = tune.splslevel( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + # multilevel params + multilevel = multilevel, + # sparsity params + test.keepX = test.keepX, test.keepY = test.keepY, + already.tested.X = already.tested.X, already.tested.Y = already.tested.Y, + # running params + BPPARAM = BPPARAM, seed = seed) } - } - + + + ## ----------- block.splsda ----------- ## + + } else if (method == "block.splsda") { + message("Calling 'tune.block.splsda'") + + result = tune.block.splsda( + # model building params + X = X, Y = Y, indY = indY, ncomp = ncomp, design = design, + tol = tol, scale = scale, max.iter = max.iter, near.zero.var = near.zero.var, + scheme = scheme, init = init, # hardcoded above + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, + # PA params + dist = dist, measure = measure, + weighted = weighted, signif.threshold = signif.threshold, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + + + ## ----------- mint.splsda ----------- ## + + } else if (method == "mint.splsda") { + message("Calling 'tune.mint.splsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.mint.splsda( + # model building params + X = X, Y = Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # PA params + dist = dist, measure = measure, auc = auc, + # running params - no need for seed or BPPARAM due to LOO CV + progressBar = progressBar,light.output = light.output) + + } + + ## ----------- end ----------- ## + result$call = match.call() return(result) } diff --git a/man/tune.Rd b/man/tune.Rd index d77fa96b..e9fe06fa 100644 --- a/man/tune.Rd +++ b/man/tune.Rd @@ -5,7 +5,8 @@ \title{Wrapper function to tune pls-derived methods.} \usage{ tune( - method = c("spls", "splsda", "block.splsda", "mint.splsda", "rcc", "pca", "spca"), + method = c("pls", "spls", "plsda", "splsda", "block.plsda", "block.splsda", + "mint.plsda", "mint.splsda", "rcc", "pca", "spca"), X, Y, test.keepX = c(5, 10, 15), @@ -13,33 +14,32 @@ tune( already.tested.X, already.tested.Y, ncomp, - mode = c("regression", "canonical", "invariant", "classic"), - nrepeat = 1, - folds = 10, - validation = "Mfold", - max.iter = 100, - tol = 1e-09, - signif.threshold = 0.01, - logratio = c("none", "CLR"), V, center = TRUE, + grid1 = seq(0.001, 1, length = 5), + grid2 = seq(0.001, 1, length = 5), + mode = c("regression", "canonical", "invariant", "classic"), + indY, + weighted = TRUE, + design, + study, + tol = 1e-09, scale = TRUE, + logratio = c("none", "CLR"), near.zero.var = FALSE, + max.iter = 100, + multilevel = NULL, + validation = "Mfold", + nrepeat = 1, + folds = 10, + signif.threshold = 0.01, dist = "max.dist", measure = ifelse(method == "spls", "cor", "BER"), - multilevel = NULL, + auc = FALSE, seed = NULL, BPPARAM = SerialParam(), progressBar = FALSE, - auc = FALSE, - light.output = TRUE, - plot = FALSE, - indY, - weighted = TRUE, - design, - grid1 = seq(0.001, 1, length = 5), - grid2 = seq(0.001, 1, length = 5), - study + light.output = TRUE ) } \arguments{ @@ -69,35 +69,42 @@ data set on the first components} \item{ncomp}{the number of components to include in the model.} +\item{V}{Matrix used in the logratio transformation id provided (for tune.pca)} + +\item{center}{a logical value indicating whether the variables should be +shifted to be zero centered. Alternately, a vector of length equal the +number of columns of \code{X} can be supplied. The value is passed to +\code{\link{scale}}.} + +\item{grid1, grid2}{vector numeric defining the values of \code{lambda1} and +\code{lambda2} at which cross-validation score should be computed. Defaults +to \code{grid1=grid2=seq(0.001, 1, length=5)}.} + \item{mode}{character string. What type of algorithm to use, (partially) matching one of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} -\item{nrepeat}{Number of times the Cross-Validation process is repeated.} +\item{indY}{To supply if \code{Y} is missing, indicates the position of +the matrix response in the list \code{X}.} -\item{folds}{the folds in the Mfold cross-validation. See Details.} +\item{weighted}{tune using either the performance of the Majority vote or +the Weighted vote.} -\item{validation}{character. What kind of (internal) validation to use, -matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is -\code{"Mfold"}.} +\item{design}{numeric matrix of size (number of blocks in X) x (number of +blocks in X) with values between 0 and 1. Each value indicates the strenght +of the relationship to be modelled between two blocks; a value of 0 +indicates no relationship, 1 is the maximum value. Alternatively, one of +c('null', 'full') indicating a disconnected or fully connected design, +respecively, or a numeric between 0 and 1 which will designate all +off-diagonal elements of a fully connected design (see examples in +\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the +\code{design} matrix is changed to include relationships to \code{Y}.} -\item{max.iter}{Integer, the maximum number of iterations.} +\item{study}{grouping factor indicating which samples are from the same +study} \item{tol}{Numeric, convergence tolerance criteria.} -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} - -\item{logratio}{one of ('none','CLR'). Default to 'none'} - -\item{V}{Matrix used in the logratio transformation id provided (for tune.pca)} - -\item{center}{a logical value indicating whether the variables should be -shifted to be zero centered. Alternately, a vector of length equal the -number of columns of \code{X} can be supplied. The value is passed to -\code{\link{scale}}.} - \item{scale}{a logical value indicating whether the variables should be scaled to have unit variance before the analysis takes place. The default is \code{FALSE} for consistency with \code{prcomp} function, but in general @@ -105,19 +112,38 @@ scaling is advisable. Alternatively, a vector of length equal the number of columns of \code{X} can be supplied. The value is passed to \code{\link{scale}}.} +\item{logratio}{one of ('none','CLR'). Default to 'none'} + \item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Default value is FALSE} +\item{max.iter}{Integer, the maximum number of iterations.} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements) that indicates the repeated measures on each individual, i.e. +the individuals ID. See Details.} + +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is +\code{"Mfold"}.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{distance metric to estimate the classification error rate, should be a subset of \code{"centroids.dist"}, \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} \item{measure}{The tuning measure used for different methods. See details.} -\item{multilevel}{Design matrix for multilevel analysis (for repeated -measurements) that indicates the repeated measures on each individual, i.e. -the individuals ID. See Details.} +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) +performance of the model.} \item{seed}{set a number here if you want the function to give reproducible outputs. Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} @@ -128,37 +154,8 @@ of parallelisation. See examples.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) -performance of the model.} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} - -\item{plot}{logical argument indicating whether an image map should be -plotted by calling the \code{imgCV} function. (for tune.rcc)} - -\item{indY}{To supply if \code{Y} is missing, indicates the position of -the matrix response in the list \code{X}.} - -\item{weighted}{tune using either the performance of the Majority vote or -the Weighted vote.} - -\item{design}{numeric matrix of size (number of blocks in X) x (number of -blocks in X) with values between 0 and 1. Each value indicates the strenght -of the relationship to be modelled between two blocks; a value of 0 -indicates no relationship, 1 is the maximum value. Alternatively, one of -c('null', 'full') indicating a disconnected or fully connected design, -respecively, or a numeric between 0 and 1 which will designate all -off-diagonal elements of a fully connected design (see examples in -\code{block.splsda}). If \code{Y} is provided instead of \code{indY}, the -\code{design} matrix is changed to include relationships to \code{Y}.} - -\item{grid1, grid2}{vector numeric defining the values of \code{lambda1} and -\code{lambda2} at which cross-validation score should be computed. Defaults -to \code{grid1=grid2=seq(0.001, 1, length=5)}.} - -\item{study}{grouping factor indicating which samples are from the same -study} } \value{ Depending on the type of analysis performed and the input arguments, From 22044538fcded064f99e038a7200449ce14c6ac1 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:12:51 +1100 Subject: [PATCH 29/75] streamline tune wrapper with exisiting functions --- R/tune.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/R/tune.R b/R/tune.R index 1ebfde2f..08ff32f5 100644 --- a/R/tune.R +++ b/R/tune.R @@ -314,6 +314,29 @@ tune <- BPPARAM = BPPARAM, seed = seed) } + ## ----------- splsda ----------- ## + + } else if (method == "splsda") { + + message("Calling 'tune.splsda'") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.splsda( + # model building params + X = X, Y = Y, ncomp = ncomp, multilevel = multilevel, + logratio = logratio, max.iter = max.iter, near.zero.var = near.zero.var, + # sparsity params + test.keepX = test.keepX, already.tested.X = already.tested.X, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, + # PA params + dist = dist, measure = measure, auc = auc, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + ## ----------- block.splsda ----------- ## From a9fd254b0e93823a72e1e32fccd53e7bc8f63b7d Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:18:12 +1100 Subject: [PATCH 30/75] update unit test for removing plot from tune.rcc --- tests/testthat/test-tune.rcc.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test-tune.rcc.R b/tests/testthat/test-tune.rcc.R index 921d6d50..26851c14 100644 --- a/tests/testthat/test-tune.rcc.R +++ b/tests/testthat/test-tune.rcc.R @@ -8,7 +8,7 @@ test_that("tune.rcc works with Mfold method", code = { Y <- nutrimouse$gene # run - tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, seed = 20) + tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", seed = 20) # check outputs expect_equal(class(tune.rcc.res), "tune.rcc") @@ -23,7 +23,7 @@ test_that("tune.rcc works with loo method", code = { Y <- nutrimouse$gene # run - tune.rcc.res <- tune.rcc(X, Y, validation = "loo", plot = FALSE, seed = 20) + tune.rcc.res <- tune.rcc(X, Y, validation = "loo", seed = 20) # check outputs expect_equal(class(tune.rcc.res), "tune.rcc") @@ -38,10 +38,10 @@ test_that("tune.rcc works in parallel same as in series", code = { Y <- nutrimouse$gene # run in series - tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SerialParam(RNGseed = NULL), seed = 12) # run in parallel - tune.rcc.res.parallel <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res.parallel <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SnowParam(workers = 2, RNGseed = NULL), seed = 12) # check outputs @@ -58,7 +58,7 @@ test_that("tune.rcc and tune(method='rcc') are equivalent", { Y <- nutrimouse$gene # run independently - tune.rcc.res.1 <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE, + tune.rcc.res.1 <- tune.rcc(X, Y, validation = "Mfold", BPPARAM = SerialParam(RNGseed = NULL), seed = 12, grid1 = c(0.001, 0.2, 0.6, 1), grid2 = c(0.001, 0.2, 0.6, 1)) From d446918bc8b5109b4ae21f1f8b93fda26d3ea87f Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:41:57 +1100 Subject: [PATCH 31/75] hardcode limQ2 param in tune.pls and tune.spls to streamline functions --- R/tune.pls.R | 4 +++- R/tune.spls.R | 5 +++-- man/tune.pls.Rd | 5 ++--- man/tune.spls.Rd | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/R/tune.pls.R b/R/tune.pls.R index 8b21c7d8..010fdc4b 100644 --- a/R/tune.pls.R +++ b/R/tune.pls.R @@ -162,7 +162,6 @@ tune.pls <- BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) { @@ -172,6 +171,9 @@ tune.pls <- mode <- match.arg(mode) BPPARAM$RNGseed <- seed + + # hardcode to streamline + limQ2 <- 0.0975 X <- .check_numeric_matrix(X, block_name = 'X') Y <- .check_numeric_matrix(Y, block_name = 'Y') diff --git a/R/tune.spls.R b/R/tune.spls.R index 440569b8..e18fb8b0 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -69,7 +69,6 @@ #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. -#' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}. #' @param ... Optional parameters passed to \code{\link{spls}} #' @return #' If \code{test.keepX != NULL} and \code{test.keepY != NULL} returns a list that contains: @@ -179,7 +178,6 @@ tune.spls <- BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) { @@ -189,6 +187,9 @@ tune.spls <- mode <- match.arg(mode) BPPARAM$RNGseed <- seed + + # hardcode to streamline + limQ2 <- 0.0975 X <- .check_numeric_matrix(X, block_name = 'X') Y <- .check_numeric_matrix(Y, block_name = 'Y') diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index 91cdc27d..a133f3a0 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -22,7 +22,6 @@ tune.pls( BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) } @@ -58,8 +57,6 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' \item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the computation completes. Default to \code{FALSE}.} -\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} - \item{...}{Optional parameters passed to \code{\link{spls}}} \item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} @@ -73,6 +70,8 @@ computation completes. Default to \code{FALSE}.} \item{arg/logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} \item{arg/multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} + +\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} } \value{ Returns a list with the following components for every repeat: diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index e0f52969..97a0267d 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -24,7 +24,6 @@ tune.spls( BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, - limQ2 = 0.0975, ... ) } @@ -66,8 +65,6 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' \item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the computation completes. Default to \code{FALSE}.} -\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} - \item{...}{Optional parameters passed to \code{\link{spls}}} \item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} From d0a31e28462c42f146b925da814a8938bc81406f Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:05:57 +1100 Subject: [PATCH 32/75] clean up input params docs --- R/tune.spls.R | 10 +++---- R/tune.splsda.R | 20 +++++++------- man/tune.spls.Rd | 14 +++++----- man/tune.splsda.Rd | 68 +++++++++++++++++++++++----------------------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/R/tune.spls.R b/R/tune.spls.R index e18fb8b0..ec66e88a 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -161,11 +161,6 @@ tune.spls <- test.keepX = NULL, test.keepY = NULL, ncomp, - # params related to CV - validation = c('Mfold', 'loo'), - nrepeat = 1, - folds, - measure = NULL, # params related to spls model building mode = c('regression', 'canonical', 'classic'), scale = TRUE, @@ -174,6 +169,11 @@ tune.spls <- max.iter = 100, near.zero.var = FALSE, multilevel = NULL, + # params related to CV + validation = c('Mfold', 'loo'), + nrepeat = 1, + folds, + measure = NULL, # params related to running tune BPPARAM = SerialParam(), seed = NULL, diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 3aec4555..0a32bcb4 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -164,21 +164,21 @@ tune.splsda <- ncomp = 1, test.keepX = NULL, # in which case perf() is run internally to only tune ncomp already.tested.X, - validation = "Mfold", - folds = 10, - dist = NULL, - measure = "BER", # one of c("overall","BER") - scale = TRUE, - auc = FALSE, - progressBar = FALSE, - tol = 1e-06, max.iter = 100, + tol = 1e-06, + scale = TRUE, near.zero.var = FALSE, - nrepeat = 1, logratio = c('none','CLR'), multilevel = NULL, - light.output = TRUE, + validation = "Mfold", + folds = 10, + nrepeat = 1, signif.threshold = 0.01, + dist = NULL, + measure = "BER", # one of c("overall","BER","AUC") + auc = FALSE, + light.output = TRUE, + progressBar = FALSE, BPPARAM = SerialParam(), seed = NULL ) diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index 97a0267d..60e1592b 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -10,10 +10,6 @@ tune.spls( test.keepX = NULL, test.keepY = NULL, ncomp, - validation = c("Mfold", "loo"), - nrepeat = 1, - folds, - measure = NULL, mode = c("regression", "canonical", "classic"), scale = TRUE, logratio = "none", @@ -21,6 +17,10 @@ tune.spls( max.iter = 100, near.zero.var = FALSE, multilevel = NULL, + validation = c("Mfold", "loo"), + nrepeat = 1, + folds, + measure = NULL, BPPARAM = SerialParam(), seed = NULL, progressBar = FALSE, @@ -41,6 +41,9 @@ test from the \eqn{Y} data set. Default to \code{ncol(Y)}.} \item{ncomp}{Positive Integer. The number of components to include in the model. Default to 2.} +\item{mode}{Character string indicating the type of PLS algorithm to use. One +of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} + \item{validation}{character. What kind of (internal) validation to use, matching one of \code{"Mfold"} or \code{"loo"} (Leave-One-out). Default is \code{"Mfold"}.} @@ -53,9 +56,6 @@ details.} \item{measure}{The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details.} -\item{mode}{Character string indicating the type of PLS algorithm to use. One -of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} - \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index 465cbfa2..3bbb3463 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -10,21 +10,21 @@ tune.splsda( ncomp = 1, test.keepX = NULL, already.tested.X, + max.iter = 100, + tol = 1e-06, + scale = TRUE, + near.zero.var = FALSE, + logratio = c("none", "CLR"), + multilevel = NULL, validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, dist = NULL, measure = "BER", - scale = TRUE, auc = FALSE, - progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - logratio = c("none", "CLR"), - multilevel = NULL, light.output = TRUE, - signif.threshold = 0.01, + progressBar = FALSE, BPPARAM = SerialParam(), seed = NULL ) @@ -45,12 +45,35 @@ using all variables in the \eqn{X} data set.} indicating the number of variables to select from the \eqn{X} data set on the firsts components.} +\item{max.iter}{integer, the maximum number of iterations.} + +\item{tol}{Convergence stopping value.} + +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{logratio}{one of ('none','CLR'). Default to 'none'} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements) that indicates the repeated measures on each individual, i.e. +the individuals ID. See Details.} + \item{validation}{character. What kind of (internal) validation to use, matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). Default is \code{"Mfold"}.} \item{folds}{the folds in the Mfold cross-validation. See Details.} +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{distance metric to use for \code{splsda} to estimate the classification error rate, should be a subset of \code{"centroids.dist"}, \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} @@ -59,37 +82,14 @@ classification error rate, should be a subset of \code{"centroids.dist"}, misclassification error \code{overall}, the Balanced Error Rate \code{BER} or the Area Under the Curve \code{AUC}} -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model based on the optimisation measure \code{measure}.} -\item{progressBar}{by default set to \code{TRUE} to output the progress bar -of the computation.} - -\item{tol}{Convergence stopping value.} - -\item{max.iter}{integer, the maximum number of iterations.} - -\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} -function (should be set to TRUE in particular for data with many zero -values). Default value is FALSE} - -\item{nrepeat}{Number of times the Cross-Validation process is repeated.} - -\item{logratio}{one of ('none','CLR'). Default to 'none'} - -\item{multilevel}{Design matrix for multilevel analysis (for repeated -measurements) that indicates the repeated measures on each individual, i.e. -the individuals ID. See Details.} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} From 416d606acbfb0a1c0653889b32fa450e7403aec4 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:31:35 +1100 Subject: [PATCH 33/75] streamline tune.block.plsda, no measure arg --- R/tune.block.plsda.R | 6 ----- examples/tune.block.plsda-examples.R | 30 +++++++++++++------------ man/tune.block.plsda.Rd | 33 ++++++++++++++-------------- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/R/tune.block.plsda.R b/R/tune.block.plsda.R index aea416f9..9be29661 100644 --- a/R/tune.block.plsda.R +++ b/R/tune.block.plsda.R @@ -118,7 +118,6 @@ tune.block.plsda <- signif.threshold=0.01, # measure of performance params dist = "max.dist", - measure = "BER", # one of c("overall","BER") weighted = TRUE, # optimise the weighted or not-weighted prediction # processing params progressBar = FALSE, @@ -213,11 +212,6 @@ tune.block.plsda <- nrepeat = 1 } - #-- measure - measure.input = measure - if (!measure %in% c("overall", "BER")) - stop("'measure' must be 'overall' or 'BER'") - #-- check significance threshold signif.threshold <- .check_alpha(signif.threshold) diff --git a/examples/tune.block.plsda-examples.R b/examples/tune.block.plsda-examples.R index 6dae25d5..c84e9f5f 100644 --- a/examples/tune.block.plsda-examples.R +++ b/examples/tune.block.plsda-examples.R @@ -1,6 +1,3 @@ -## Set up data - -# load data data("breast.TCGA") # X data - list of mRNA and miRNA @@ -18,20 +15,19 @@ Y <- Y[subset] # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript -design = matrix(1, ncol = length(data), nrow = length(data), - dimnames = list(names(data), names(data))) +design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) diag(design) = 0 design -## Tune number of components to keep +## Tune number of components to keep - use all distance metrics tune_res <- tune.block.plsda(X, Y, design = design, - ncomp = 5, - nrepeat = 3, - seed = 13, - dist = c("max.dist", "centroids.dist")) + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("all")) plot(tune_res) - tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist @@ -40,8 +36,14 @@ tune_res <- tune.block.plsda(X, Y, design = design, ncomp = 5, nrepeat = 3, seed = 13, - dist = c("max.dist", "centroids.dist"), + dist = c("all"), weighted = FALSE) -plot(tune_res) +tune_res$weights -tune_res$weights \ No newline at end of file +## Tune number of components to keep - plot just max.dist +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist")) +plot(tune_res) \ No newline at end of file diff --git a/man/tune.block.plsda.Rd b/man/tune.block.plsda.Rd index 1c7c5533..f77e8f05 100644 --- a/man/tune.block.plsda.Rd +++ b/man/tune.block.plsda.Rd @@ -19,7 +19,6 @@ tune.block.plsda( nrepeat = 1, signif.threshold = 0.01, dist = "max.dist", - measure = "BER", weighted = TRUE, progressBar = FALSE, light.output = TRUE, @@ -80,8 +79,6 @@ to 0.01.} classification error rate, should be a subset of \code{"centroids.dist"}, \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} -\item{measure}{The tuning measure used for different methods. See details.} - \item{weighted}{tune using either the performance of the Majority vote or the Weighted vote.} @@ -174,9 +171,6 @@ class, weighted by the number of samples in each class. BER is less biased towards majority classes during the performance assessment. } \examples{ -## Set up data - -# load data data("breast.TCGA") # X data - list of mRNA and miRNA @@ -194,20 +188,19 @@ Y <- Y[subset] # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript -design = matrix(1, ncol = length(data), nrow = length(data), - dimnames = list(names(data), names(data))) +design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) diag(design) = 0 design -## Tune number of components to keep +## Tune number of components to keep - use all distance metrics tune_res <- tune.block.plsda(X, Y, design = design, - ncomp = 5, - nrepeat = 3, - seed = 13, - dist = c("max.dist", "centroids.dist")) + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("all")) plot(tune_res) - tune_res$choice.ncomp # 3 components best for max.dist, 1 for centroids.dist @@ -216,11 +209,17 @@ tune_res <- tune.block.plsda(X, Y, design = design, ncomp = 5, nrepeat = 3, seed = 13, - dist = c("max.dist", "centroids.dist"), + dist = c("all"), weighted = FALSE) -plot(tune_res) - tune_res$weights + +## Tune number of components to keep - plot just max.dist +tune_res <- tune.block.plsda(X, Y, design = design, + ncomp = 5, + nrepeat = 3, + seed = 13, + dist = c("max.dist")) +plot(tune_res) } \references{ Method: From 9d706d50c38b15017e809fdd06f1f92cdf2437d5 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:46:55 +1100 Subject: [PATCH 34/75] update tune.block.splsda docs to reflect how to use 'dist' and 'measure' --- R/tune.block.splsda.R | 15 +++------ examples/tune.block.splsda-examples.R | 25 ++++++++------- man/tune.block.splsda.Rd | 45 +++++++++++---------------- 3 files changed, 38 insertions(+), 47 deletions(-) diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index f7d72380..f162b04e 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -44,21 +44,16 @@ #' numeric vectors each of length \code{n_tested} indicating the number of #' variables to select from the \eqn{X} data set on the first \code{n_tested} #' components. +#' @param measure only used when \code{test.keepX} is not NULL. Measure used when plotting, +#' should be 'BER' or 'overall' +#' @param dist distance metric to estimate the classification error rate, should be one of +#' "centroids.dist", "mahalanobis.dist" or "max.dist" (see Details). If \code{test.keepX = NULL}, +#' can also input "all" or more than one distance metric #' @param weighted tune using either the performance of the Majority vote or #' the Weighted vote. #' @param signif.threshold numeric between 0 and 1 indicating the significance #' threshold required for improvement in error rate of the components. Default #' to 0.01. -#' @param ... Optional arguments: -#' \itemize{ -#' \item \bold{seed} Integer. Seed number for reproducible parallel code. -#' Default is \code{NULL}. -#' } -#' run in parallel when repeating the cross-validation, which is usually the -#' most computationally intensive process. If there is excess CPU, the -#' cross-vaidation is also parallelised on *nix-based OS which support -#' \code{mclapply}. -#' Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'. #' @return A list that contains: \item{error.rate}{returns the prediction error #' for each \code{test.keepX} on each component, averaged across all repeats #' and subsampling folds. Standard deviation is also output. All error rates diff --git a/examples/tune.block.splsda-examples.R b/examples/tune.block.splsda-examples.R index 251b67e2..dd60ebce 100644 --- a/examples/tune.block.splsda-examples.R +++ b/examples/tune.block.splsda-examples.R @@ -18,8 +18,8 @@ Y <- Y[subset] # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript -design = matrix(1, ncol = length(data), nrow = length(data), - dimnames = list(names(data), names(data))) +design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) diag(design) = 0 design @@ -27,7 +27,8 @@ design tune_res <- tune.block.splsda(X, Y, design = design, ncomp = 5, test.keepX = NULL, - nrepeat = 3, + validation = "Mfold", nrepeat = 3, + dist = "all", measure = "BER", seed = 13) plot(tune_res) @@ -43,13 +44,13 @@ test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8)) # load parallel package library(BiocParallel) -# run tuning in parallel on 2 cores +# run tuning in parallel on 2 cores, output plot on overall error tune_res <- tune.block.splsda(X, Y, design = design, ncomp = 2, test.keepX = test.keepX, - nrepeat = 3, - seed = 13, - BPPARAM = SnowParam(workers = 2)) + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2)) plot(tune_res) tune_res$choice.keepX @@ -57,8 +58,10 @@ tune_res$choice.keepX # Now tuning a new component given previous tuned keepX already.tested.X <- tune_res$choice.keepX tune_res <- tune.block.splsda(X, Y, design = design, - ncomp = 3, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - BPPARAM = SnowParam(workers = 2)) + ncomp = 3, + test.keepX = test.keepX, + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2), + already.tested.X = already.tested.X) tune_res$choice.keepX \ No newline at end of file diff --git a/man/tune.block.splsda.Rd b/man/tune.block.splsda.Rd index a43e0489..3294a683 100644 --- a/man/tune.block.splsda.Rd +++ b/man/tune.block.splsda.Rd @@ -89,11 +89,12 @@ matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is threshold required for improvement in error rate of the components. Default to 0.01.} -\item{dist}{distance metric to estimate the -classification error rate, should be a subset of \code{"centroids.dist"}, -\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} +\item{dist}{distance metric to estimate the classification error rate, should be one of +"centroids.dist", "mahalanobis.dist" or "max.dist" (see Details). If \code{test.keepX = NULL}, +can also input "all" or more than one distance metric} -\item{measure}{The tuning measure used for different methods. See details.} +\item{measure}{only used when \code{test.keepX} is not NULL. Measure used when plotting, +should be 'BER' or 'overall'} \item{weighted}{tune using either the performance of the Majority vote or the Weighted vote.} @@ -109,17 +110,6 @@ of parallelisation. See examples.} \item{seed}{set a number here if you want the function to give reproducible outputs. Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'.} - -\item{...}{Optional arguments: -\itemize{ - \item \bold{seed} Integer. Seed number for reproducible parallel code. - Default is \code{NULL}. -} -run in parallel when repeating the cross-validation, which is usually the -most computationally intensive process. If there is excess CPU, the -cross-vaidation is also parallelised on *nix-based OS which support -\code{mclapply}. -Note that the argument 'scheme' has now been hardcoded to 'horst' and 'init' to 'svd.single'.} } \value{ A list that contains: \item{error.rate}{returns the prediction error @@ -230,8 +220,8 @@ Y <- Y[subset] # set up a full design where every block is connected # could also consider other weights, see our mixOmics manuscript -design = matrix(1, ncol = length(data), nrow = length(data), - dimnames = list(names(data), names(data))) +design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) diag(design) = 0 design @@ -239,7 +229,8 @@ design tune_res <- tune.block.splsda(X, Y, design = design, ncomp = 5, test.keepX = NULL, - nrepeat = 3, + validation = "Mfold", nrepeat = 3, + dist = "all", measure = "BER", seed = 13) plot(tune_res) @@ -255,13 +246,13 @@ test.keepX = list(mrna = c(10, 30), mirna = c(15, 25), protein = c(4, 8)) # load parallel package library(BiocParallel) -# run tuning in parallel on 2 cores +# run tuning in parallel on 2 cores, output plot on overall error tune_res <- tune.block.splsda(X, Y, design = design, ncomp = 2, test.keepX = test.keepX, - nrepeat = 3, - seed = 13, - BPPARAM = SnowParam(workers = 2)) + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2)) plot(tune_res) tune_res$choice.keepX @@ -269,10 +260,12 @@ tune_res$choice.keepX # Now tuning a new component given previous tuned keepX already.tested.X <- tune_res$choice.keepX tune_res <- tune.block.splsda(X, Y, design = design, - ncomp = 3, - test.keepX = test.keepX, - already.tested.X = already.tested.X, - BPPARAM = SnowParam(workers = 2)) + ncomp = 3, + test.keepX = test.keepX, + validation = "Mfold", nrepeat = 3, + measure = "overall", + seed = 13, BPPARAM = SnowParam(workers = 2), + already.tested.X = already.tested.X) tune_res$choice.keepX } \references{ From 3e6c4d808fbd05e2b4188aed68155c524dc73f7f Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:50:26 +1100 Subject: [PATCH 35/75] update dist default to 'all' for tune.block.plsda --- R/tune.block.plsda.R | 4 +++- man/tune.block.plsda.Rd | 7 +++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/tune.block.plsda.R b/R/tune.block.plsda.R index 9be29661..80998fd2 100644 --- a/R/tune.block.plsda.R +++ b/R/tune.block.plsda.R @@ -29,6 +29,8 @@ #' #' @inheritParams block.plsda #' @inheritParams tune +#' @param dist Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". +#' Default is "all" #' @param weighted tune using either the performance of the Majority vote or #' the Weighted vote. #' @param signif.threshold numeric between 0 and 1 indicating the significance @@ -117,7 +119,7 @@ tune.block.plsda <- nrepeat = 1, signif.threshold=0.01, # measure of performance params - dist = "max.dist", + dist = "all", weighted = TRUE, # optimise the weighted or not-weighted prediction # processing params progressBar = FALSE, diff --git a/man/tune.block.plsda.Rd b/man/tune.block.plsda.Rd index f77e8f05..d2401355 100644 --- a/man/tune.block.plsda.Rd +++ b/man/tune.block.plsda.Rd @@ -18,7 +18,7 @@ tune.block.plsda( folds = 10, nrepeat = 1, signif.threshold = 0.01, - dist = "max.dist", + dist = "all", weighted = TRUE, progressBar = FALSE, light.output = TRUE, @@ -75,9 +75,8 @@ matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is threshold required for improvement in error rate of the components. Default to 0.01.} -\item{dist}{distance metric to estimate the -classification error rate, should be a subset of \code{"centroids.dist"}, -\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} +\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". +Default is "all"} \item{weighted}{tune using either the performance of the Majority vote or the Weighted vote.} From bf1ffc067edd24cc4fd98be16658f251388f2c9f Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:54:02 +1100 Subject: [PATCH 36/75] streamline tune.plsda - no 'measure' arg --- R/tune.plsda.R | 27 ++++++++------------------- man/tune.plsda.Rd | 23 +++++------------------ 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/R/tune.plsda.R b/R/tune.plsda.R index 801c1235..1fd6e294 100644 --- a/R/tune.plsda.R +++ b/R/tune.plsda.R @@ -49,15 +49,10 @@ #' If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the #' Area Under the Curve is averaged using one-vs-all comparison. Note however #' that the AUC criteria may not be particularly insightful as the prediction -#' threshold we use in sPLS-DA differs from an AUC threshold (PLS-DA relies on +#' threshold we use in PLS-DA differs from an AUC threshold (PLS-DA relies on #' prediction distances for predictions, see \code{?predict.plsda} for more #' details) and the supplemental material of the mixOmics article (Rohart et -#' al. 2017). If you want the AUC criterion to be insightful, you should use -#' \code{measure==AUC} as this will output the number of variable that -#' maximises the AUC; in this case there is no prediction threshold from -#' PLS-DA (\code{dist} is not used). If \code{measure==AUC}, we do not output -#' SD as this measure can be a mean (over \code{nrepeat}) of means (over the -#' categories). +#' al. 2017). #' #' BER is appropriate in case of an unbalanced number of samples per class as #' it calculates the average proportion of wrongly classified samples in each @@ -81,13 +76,9 @@ #' @param dist distance metric to use for \code{splsda} to estimate the #' classification error rate, should be a subset of \code{"centroids.dist"}, #' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). -#' @param measure Three misclassification measure are available: overall -#' misclassification error \code{overall}, the Balanced Error Rate \code{BER} -#' or the Area Under the Curve \code{AUC} #' @param scale Logical. If scale = TRUE, each block is standardized to zero #' means and unit variances (default: TRUE) #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) -#' performance of the model based on the optimisation measure \code{measure}. #' @param progressBar by default set to \code{TRUE} to output the progress bar #' of the computation. #' @param tol Convergence stopping value. @@ -105,6 +96,7 @@ #' @param signif.threshold numeric between 0 and 1 indicating the significance #' threshold required for improvement in error rate of the components. Default #' to 0.01. +#' @param dist Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all" #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. @@ -136,8 +128,7 @@ tune.plsda <- ncomp = 1, validation = "Mfold", folds = 10, - dist = NULL, - measure = "BER", # one of c("overall","BER") + dist = "all", scale = TRUE, auc = FALSE, progressBar = FALSE, @@ -233,15 +224,13 @@ tune.plsda <- #-- logratio logratio <- match.arg(logratio) - #-- measure - choices = c("BER", "overall","AUC") - measure = choices[pmatch(measure, choices)] - if (is.na(measure)) - stop("'measure' must be either 'BER', 'overall' or 'AUC' ") - + #-- validation if (any(is.na(validation)) || length(validation) > 1) stop("'validation' should be one of 'Mfold' or 'loo'.", call. = FALSE) + + #------------------# + #-- run perf --# plsda_res <- plsda(X, Y, ncomp, scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, logratio = logratio, multilevel = multilevel) perf_res <- perf(plsda_res, diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd index 8b29a26d..f87974be 100644 --- a/man/tune.plsda.Rd +++ b/man/tune.plsda.Rd @@ -10,8 +10,7 @@ tune.plsda( ncomp = 1, validation = "Mfold", folds = 10, - dist = NULL, - measure = "BER", + dist = "all", scale = TRUE, auc = FALSE, progressBar = FALSE, @@ -41,19 +40,12 @@ Default is \code{"Mfold"}.} \item{folds}{the folds in the Mfold cross-validation. See Details.} -\item{dist}{distance metric to use for \code{splsda} to estimate the -classification error rate, should be a subset of \code{"centroids.dist"}, -\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} - -\item{measure}{Three misclassification measure are available: overall -misclassification error \code{overall}, the Balanced Error Rate \code{BER} -or the Area Under the Curve \code{AUC}} +\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all"} \item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE)} -\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) -performance of the model based on the optimisation measure \code{measure}.} +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC)} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} @@ -144,15 +136,10 @@ many folds to generate is selected by specifying the number of folds in If \code{auc = TRUE} and there are more than 2 categories in \code{Y}, the Area Under the Curve is averaged using one-vs-all comparison. Note however that the AUC criteria may not be particularly insightful as the prediction -threshold we use in sPLS-DA differs from an AUC threshold (PLS-DA relies on +threshold we use in PLS-DA differs from an AUC threshold (PLS-DA relies on prediction distances for predictions, see \code{?predict.plsda} for more details) and the supplemental material of the mixOmics article (Rohart et -al. 2017). If you want the AUC criterion to be insightful, you should use -\code{measure==AUC} as this will output the number of variable that -maximises the AUC; in this case there is no prediction threshold from -PLS-DA (\code{dist} is not used). If \code{measure==AUC}, we do not output -SD as this measure can be a mean (over \code{nrepeat}) of means (over the -categories). +al. 2017). BER is appropriate in case of an unbalanced number of samples per class as it calculates the average proportion of wrongly classified samples in each From 80961a765ac8fa5cddfa8f3dd8747cd907424da0 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:06:55 +1100 Subject: [PATCH 37/75] update docs for tune.splsda --- R/tune.splsda.R | 7 ++++--- man/tune.splsda.Rd | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 0a32bcb4..283ab5e8 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -89,11 +89,12 @@ #' Default is \code{"Mfold"}. #' @param folds the folds in the Mfold cross-validation. See Details. #' @param dist distance metric to use for \code{splsda} to estimate the -#' classification error rate, should be a subset of \code{"centroids.dist"}, -#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +#' classification error rate, should one of \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +#' If \code{test.keepX = NULL} multiple distances can be inputted or "all". #' @param measure Three misclassification measure are available: overall #' misclassification error \code{overall}, the Balanced Error Rate \code{BER} -#' or the Area Under the Curve \code{AUC} +#' or the Area Under the Curve \code{AUC}. Only used when \code{test.keepX} is not NULL. #' @param scale Logical. If scale = TRUE, each block is standardized to zero #' means and unit variances (default: TRUE) #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index 3bbb3463..18b95bb1 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -75,12 +75,13 @@ threshold required for improvement in error rate of the components. Default to 0.01.} \item{dist}{distance metric to use for \code{splsda} to estimate the -classification error rate, should be a subset of \code{"centroids.dist"}, -\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details).} +classification error rate, should one of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). +If \code{test.keepX = NULL} multiple distances can be inputted or "all".} \item{measure}{Three misclassification measure are available: overall misclassification error \code{overall}, the Balanced Error Rate \code{BER} -or the Area Under the Curve \code{AUC}} +or the Area Under the Curve \code{AUC}. Only used when \code{test.keepX} is not NULL.} \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model based on the optimisation measure \code{measure}.} From 005f952bab599c2ad990436530dbac7c5a249865 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:07:43 +1100 Subject: [PATCH 38/75] make tune.spls example code lines less long --- examples/tune.spls-examples.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/tune.spls-examples.R b/examples/tune.spls-examples.R index e38b5968..781c2469 100644 --- a/examples/tune.spls-examples.R +++ b/examples/tune.spls-examples.R @@ -25,7 +25,11 @@ plot(tune.res) # plot outputs Y1 <- liver.toxicity$clinic[,1] # tune spls model for components only -plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = NULL, test.keepY = NULL)) +plot(tune.spls(X, Y1, ncomp = 3, +folds = 3, +test.keepX = NULL, test.keepY = NULL)) # tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set -plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8), measure = "MSE")) \ No newline at end of file +plot(tune.spls(X, Y1, ncomp = 3, +folds = 3, measure = "MSE", +test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) \ No newline at end of file From b0f9ab9343922207b98985e36da21a267441687c Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:16:07 +1100 Subject: [PATCH 39/75] fix tune.pls docs --- R/tune.pls.R | 20 +++++++++----------- man/tune.pls.Rd | 32 +++++++++++++++----------------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/R/tune.pls.R b/R/tune.pls.R index 010fdc4b..d9f571aa 100644 --- a/R/tune.pls.R +++ b/R/tune.pls.R @@ -56,12 +56,12 @@ #' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. #' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} #' @template arg/mode -#' @param arg/scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE -#' @param arg/tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. -#' @param arg/max.iter Integer, the maximum number of iterations. Default to 100. -#' @param arg/near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. -#' @param arg/logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. -#' @param arg/multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. +#' @param scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE +#' @param tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. +#' @param max.iter Integer, the maximum number of iterations. Default to 100. +#' @param near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. +#' @param multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. #' @template arg/progressBar #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. @@ -103,10 +103,8 @@ #' \item{error.rate} #' #' -#' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, -#' Florian Rohart, -#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and -#' http://www.mixOmics.org for more details. +#' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. +#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}}, and http://www.mixOmics.org for more details. #' @references mixOmics article: #' #' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics @@ -124,7 +122,7 @@ #' (MSEP) Estimates for Principal Component Regression (PCR) and Partial Least #' Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429. #' -#' sparse PLS regression mode: +#' Sparse PLS regression mode: #' #' Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse #' PLS for variable selection when integrating Omics data. Statistical diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index a133f3a0..c6dcadc5 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -48,6 +48,18 @@ details.} \item{mode}{Character string indicating the type of PLS algorithm to use. One of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} + +\item{logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} + +\item{tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} + +\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} + +\item{multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} + \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} @@ -59,18 +71,6 @@ computation completes. Default to \code{FALSE}.} \item{...}{Optional parameters passed to \code{\link{spls}}} -\item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} - -\item{arg/tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} - -\item{arg/max.iter}{Integer, the maximum number of iterations. Default to 100.} - -\item{arg/near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} - -\item{arg/logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} - -\item{arg/multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} - \item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} } \value{ @@ -213,7 +213,7 @@ Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction (MSEP) Estimates for Principal Component Regression (PCR) and Partial Least Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429. -sparse PLS regression mode: +Sparse PLS regression mode: Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse PLS for variable selection when integrating Omics data. Statistical @@ -227,12 +227,10 @@ Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A&, Wells CA& 4:e1845. } \seealso{ -\code{\link{splsda}}, \code{\link{predict.splsda}} and -http://www.mixOmics.org for more details. +\code{\link{splsda}}, \code{\link{predict.splsda}}, and http://www.mixOmics.org for more details. } \author{ -Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, -Florian Rohart, +Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. } \keyword{multivariate} \keyword{regression} From eee01cb198af5213cd94adbf372da99915301dd9 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:18:44 +1100 Subject: [PATCH 40/75] fix errors in tune.spls docs --- R/tune.spls.R | 12 ++++++------ man/tune.spls.Rd | 32 ++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/R/tune.spls.R b/R/tune.spls.R index ec66e88a..bd08e78c 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -59,12 +59,12 @@ #' @param measure The tuning measure to use. Cannot be NULL when applied to sPLS1 object. See details. #' @templateVar modes \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"} #' @template arg/mode -#' @param arg/scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE -#' @param arg/tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. -#' @param arg/max.iter Integer, the maximum number of iterations. Default to 100. -#' @param arg/near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. -#' @param arg/logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. -#' @param arg/multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. +#' @param scale Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE +#' @param tol Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06. +#' @param max.iter Integer, the maximum number of iterations. Default to 100. +#' @param near.zero.var Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE. +#' @param logratio Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details. +#' @param multilevel Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples. #' @template arg/progressBar #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index 60e1592b..c20e22de 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -44,6 +44,18 @@ model. Default to 2.} \item{mode}{Character string indicating the type of PLS algorithm to use. One of \code{"regression"}, \code{"canonical"}, \code{"invariant"} or \code{"classic"}. See Details.} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} + +\item{logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} + +\item{tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} + +\item{max.iter}{Integer, the maximum number of iterations. Default to 100.} + +\item{near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} + +\item{multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} + \item{validation}{character. What kind of (internal) validation to use, matching one of \code{"Mfold"} or \code{"loo"} (Leave-One-out). Default is \code{"Mfold"}.} @@ -66,18 +78,6 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' computation completes. Default to \code{FALSE}.} \item{...}{Optional parameters passed to \code{\link{spls}}} - -\item{arg/scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE} - -\item{arg/tol}{Positive numeric used as convergence criteria/tolerance during the iterative process. Default to 1e-06.} - -\item{arg/max.iter}{Integer, the maximum number of iterations. Default to 100.} - -\item{arg/near.zero.var}{Logical, see the internal nearZeroVar function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations. Default value is FALSE.} - -\item{arg/logratio}{Character, one of ('none','CLR') specifies the log ratio transformation to deal with compositional values that may arise from specific normalisation in sequencing data. Default to 'none'. See ?logratio.transfo for details.} - -\item{arg/multilevel}{Numeric, design matrix for repeated measurement analysis, where multilevel decomposition is required. For a one factor decomposition, the repeated measures on each individual, i.e. the individuals ID is input as the first column. For a 2 level factor decomposition then 2nd AND 3rd columns indicate those factors. See examples.} } \value{ If \code{test.keepX != NULL} and \code{test.keepY != NULL} returns a list that contains: @@ -223,10 +223,14 @@ plot(tune.res) # plot outputs Y1 <- liver.toxicity$clinic[,1] # tune spls model for components only -plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = NULL, test.keepY = NULL)) +plot(tune.spls(X, Y1, ncomp = 3, +folds = 3, +test.keepX = NULL, test.keepY = NULL)) # tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set -plot(tune.spls(X, Y1, ncomp = 3, folds = 3, test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8), measure = "MSE")) +plot(tune.spls(X, Y1, ncomp = 3, +folds = 3, measure = "MSE", +test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) } \references{ mixOmics article: From bd1610a227573a8fb6bdc6206a876bc24a5c2969 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:24:42 +1100 Subject: [PATCH 41/75] allow tune.mint functions to take extra args --- R/tune.mint.plsda.R | 3 ++- R/tune.mint.splsda.R | 3 ++- man/tune.mint.plsda.Rd | 3 ++- man/tune.mint.splsda.Rd | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index 48af7631..5ee2e69d 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -99,7 +99,8 @@ tune.mint.plsda <- max.iter = 100, near.zero.var = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01 + signif.threshold = 0.01, + ... ) { #-- checking general input parameters --------------------------------------# diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 0f0e3246..659334ed 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -123,7 +123,8 @@ tune.mint.splsda <- max.iter = 100, near.zero.var = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01 + signif.threshold = 0.01, + ... ) { #-- checking general input parameters --------------------------------------# diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd index 24344086..37d0af38 100644 --- a/man/tune.mint.plsda.Rd +++ b/man/tune.mint.plsda.Rd @@ -18,7 +18,8 @@ tune.mint.plsda( max.iter = 100, near.zero.var = FALSE, light.output = TRUE, - signif.threshold = 0.01 + signif.threshold = 0.01, + ... ) } \arguments{ diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 566b3e36..73e19710 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -20,7 +20,8 @@ tune.mint.splsda( max.iter = 100, near.zero.var = FALSE, light.output = TRUE, - signif.threshold = 0.01 + signif.threshold = 0.01, + ... ) } \arguments{ From 2832fdfb37005506823baef64d081c9a2b58dc3a Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:36:49 +1100 Subject: [PATCH 42/75] fixed docs warnings --- R/tune.pls.R | 4 +--- R/tune.spls.R | 1 - man/tune.pls.Rd | 4 +--- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/R/tune.pls.R b/R/tune.pls.R index d9f571aa..a2dabe36 100644 --- a/R/tune.pls.R +++ b/R/tune.pls.R @@ -66,8 +66,7 @@ #' @template arg/BPPARAM #' @param seed set a number here if you want the function to give reproducible outputs. #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. -#' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}. -#' @param ... Optional parameters passed to \code{\link{spls}} +#' @param ... Optional parameters passed to \code{\link{pls}} #' @return #' Returns a list with the following components for every repeat: #' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only @@ -102,7 +101,6 @@ #' predicted and actual components for X (t) and Y (u)} #' \item{error.rate} #' -#' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. #' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}}, and http://www.mixOmics.org for more details. #' @references mixOmics article: diff --git a/R/tune.spls.R b/R/tune.spls.R index bd08e78c..8efa3b93 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -116,7 +116,6 @@ #' predicted and actual components for X (t) and Y (u)} #' \item{error.rate} #' -#' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, #' Florian Rohart, #' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index c6dcadc5..dadee870 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -69,9 +69,7 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' \item{progressBar}{Logical. If \code{TRUE} a progress bar is shown as the computation completes. Default to \code{FALSE}.} -\item{...}{Optional parameters passed to \code{\link{spls}}} - -\item{limQ2}{Q2 threshold for recommending optimal \code{ncomp}.} +\item{...}{Optional parameters passed to \code{\link{pls}}} } \value{ Returns a list with the following components for every repeat: From 911d6133bb7ae8d693d78797c34d065d22cddd95 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:55:38 +1100 Subject: [PATCH 43/75] fix tune.rcc example --- R/image.tune.rcc.R | 2 +- man/image.tune.rcc.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/image.tune.rcc.R b/R/image.tune.rcc.R index 2585ff56..f462c04d 100644 --- a/R/image.tune.rcc.R +++ b/R/image.tune.rcc.R @@ -24,7 +24,7 @@ #' Y <- nutrimouse$gene #' #' ## this can take some seconds -#' cv.score <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE) +#' cv.score <- tune.rcc(X, Y, validation = "Mfold") #' plot(cv.score) #' #' # image(cv.score) # same result as plot() diff --git a/man/image.tune.rcc.Rd b/man/image.tune.rcc.Rd index c08786af..8729dae9 100644 --- a/man/image.tune.rcc.Rd +++ b/man/image.tune.rcc.Rd @@ -38,7 +38,7 @@ X <- nutrimouse$lipid Y <- nutrimouse$gene ## this can take some seconds -cv.score <- tune.rcc(X, Y, validation = "Mfold", plot = FALSE) +cv.score <- tune.rcc(X, Y, validation = "Mfold") plot(cv.score) # image(cv.score) # same result as plot() From 3e27899fd0919765c3fa9cd3faf296c4725a135c Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:58:46 +1100 Subject: [PATCH 44/75] fix warnings in tune.pls --- R/tune.pls.R | 1 - R/tune.spls.R | 1 - man/tune.pls.Rd | 3 +-- man/tune.spls.Rd | 3 +-- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/R/tune.pls.R b/R/tune.pls.R index a2dabe36..b882bb98 100644 --- a/R/tune.pls.R +++ b/R/tune.pls.R @@ -99,7 +99,6 @@ #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the #' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate} #' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo and Florian Rohart. #' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}}, and http://www.mixOmics.org for more details. diff --git a/R/tune.spls.R b/R/tune.spls.R index 8efa3b93..3821f833 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -114,7 +114,6 @@ #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the #' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate} #' #' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo, #' Florian Rohart, diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index dadee870..4401e4ef 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -102,8 +102,7 @@ if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate} +predicted and actual components for X (t) and Y (u)} } \description{ Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index c20e22de..5bb87b5d 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -122,8 +122,7 @@ if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate} +predicted and actual components for X (t) and Y (u)} } \description{ Computes M-fold or Leave-One-Out Cross-Validation scores on a user-input From b49d19622bdef16927645f94cb3158a8bd77bda0 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:30:47 +1100 Subject: [PATCH 45/75] fix note in perf.assess.pls --- R/perf.assess.pls.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/perf.assess.pls.R b/R/perf.assess.pls.R index a015d924..8d0dc6b4 100644 --- a/R/perf.assess.pls.R +++ b/R/perf.assess.pls.R @@ -41,6 +41,10 @@ perf.assess.mixo_pls <- function(object, seed = NULL, ...) { + + # define global variables to avoid NOTES in R CMD Check + utils::globalVariables(c("comp", "block", "measure", "value", "feature", "stability", "lower", "upper", "keepX", "keepY")) + # checking args and initialize params ncomp = object$ncomp spls.model <- is(object, 'mixo_spls') @@ -65,7 +69,7 @@ perf.assess.mixo_pls <- function(object, measures <- as.data.frame(measures) # Add this line to remove rows with NAs that correspond to components < ncomp - measures <- dplyr::filter(measures, comp == ncomp) + measures <- dplyr::filter(measures, .data$comp == ncomp) ## R CMD check stuff measure <- feature <- comp <- block <- stability <- value <- NULL @@ -145,6 +149,12 @@ perf.assess.mixo_spls <- perf.assess.mixo_pls { # changes to bypass the loop for the Q2 + + # to deal with NOTE saying comp is not defined + utils::globalVariables(c("comp")) + ## R CMD check stuff + measure <- feature <- comp <- block <- stability <- value <- NULL + lower <- upper <- keepX <- keepY <- NULL ## -------- checks -------- ## if (object$mode == 'invariant') From 26e6e4a22ff2baa85f6e222aa0077a02a007bc3e Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:48:14 +1100 Subject: [PATCH 46/75] fix notes and warnings in tune docs --- R/tune.R | 1 - R/tune.block.splsda.R | 3 +-- R/tune.mint.plsda.R | 6 ++++-- R/tune.mint.splsda.R | 6 ++++-- man/tune.block.splsda.Rd | 3 +-- man/tune.mint.plsda.Rd | 3 +-- man/tune.mint.splsda.Rd | 3 +-- 7 files changed, 12 insertions(+), 13 deletions(-) diff --git a/R/tune.R b/R/tune.R index 08ff32f5..de955377 100644 --- a/R/tune.R +++ b/R/tune.R @@ -347,7 +347,6 @@ tune <- # model building params X = X, Y = Y, indY = indY, ncomp = ncomp, design = design, tol = tol, scale = scale, max.iter = max.iter, near.zero.var = near.zero.var, - scheme = scheme, init = init, # hardcoded above # sparsity params test.keepX = test.keepX, already.tested.X = already.tested.X, # CV params diff --git a/R/tune.block.splsda.R b/R/tune.block.splsda.R index f162b04e..1b522c58 100644 --- a/R/tune.block.splsda.R +++ b/R/tune.block.splsda.R @@ -155,8 +155,7 @@ tune.block.splsda <- progressBar = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat BPPARAM = SerialParam(), - seed = NULL, - ...) + seed = NULL) { if (hasArg('cpus')) #defunct { diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index 5ee2e69d..bd6777cb 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -99,12 +99,14 @@ tune.mint.plsda <- max.iter = 100, near.zero.var = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01, - ... + signif.threshold = 0.01 ) { #-- checking general input parameters --------------------------------------# #---------------------------------------------------------------------------# + + ## R CMD check stuff + BPPARAM <- seed <- NULL #------------------# #-- check entries --# diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 659334ed..01419a67 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -123,12 +123,14 @@ tune.mint.splsda <- max.iter = 100, near.zero.var = FALSE, light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01, - ... + signif.threshold = 0.01 ) { #-- checking general input parameters --------------------------------------# #---------------------------------------------------------------------------# + + ## R CMD check stuff + BPPARAM <- seed <- NULL #------------------# #-- check entries --# diff --git a/man/tune.block.splsda.Rd b/man/tune.block.splsda.Rd index 3294a683..0b365683 100644 --- a/man/tune.block.splsda.Rd +++ b/man/tune.block.splsda.Rd @@ -27,8 +27,7 @@ tune.block.splsda( progressBar = FALSE, light.output = TRUE, BPPARAM = SerialParam(), - seed = NULL, - ... + seed = NULL ) } \arguments{ diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd index 37d0af38..24344086 100644 --- a/man/tune.mint.plsda.Rd +++ b/man/tune.mint.plsda.Rd @@ -18,8 +18,7 @@ tune.mint.plsda( max.iter = 100, near.zero.var = FALSE, light.output = TRUE, - signif.threshold = 0.01, - ... + signif.threshold = 0.01 ) } \arguments{ diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 73e19710..566b3e36 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -20,8 +20,7 @@ tune.mint.splsda( max.iter = 100, near.zero.var = FALSE, light.output = TRUE, - signif.threshold = 0.01, - ... + signif.threshold = 0.01 ) } \arguments{ From 11262b8818bd339433969bd28f55a7ea028a1b80 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:58:53 +1100 Subject: [PATCH 47/75] fix unit test errors --- R/perf.assess.pls.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/R/perf.assess.pls.R b/R/perf.assess.pls.R index 8d0dc6b4..01ef7f5b 100644 --- a/R/perf.assess.pls.R +++ b/R/perf.assess.pls.R @@ -42,9 +42,6 @@ perf.assess.mixo_pls <- function(object, ...) { - # define global variables to avoid NOTES in R CMD Check - utils::globalVariables(c("comp", "block", "measure", "value", "feature", "stability", "lower", "upper", "keepX", "keepY")) - # checking args and initialize params ncomp = object$ncomp spls.model <- is(object, 'mixo_spls') @@ -150,8 +147,6 @@ perf.assess.mixo_spls <- perf.assess.mixo_pls { # changes to bypass the loop for the Q2 - # to deal with NOTE saying comp is not defined - utils::globalVariables(c("comp")) ## R CMD check stuff measure <- feature <- comp <- block <- stability <- value <- NULL lower <- upper <- keepX <- keepY <- NULL From 2380e3f62e2b396b8da3573399e1e3eb9f0a6552 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 12:10:24 +1100 Subject: [PATCH 48/75] fix notes in docs --- R/perf.assess.R | 14 +++++++------- man/perf.assess.Rd | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/perf.assess.R b/R/perf.assess.R index 60740196..fb90d107 100644 --- a/R/perf.assess.R +++ b/R/perf.assess.R @@ -89,18 +89,18 @@ #' @param object object of class inherited from \code{"pls"}, \code{"plsda"}, #' \code{"spls"}, \code{"splsda"} or \code{"mint.splsda"}. The function will #' retrieve some key parameters stored in that object. +#' @param validation a character string. What kind of (internal) validation to use, +#' matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is +#' \code{"Mfold"}. For MINT methods only \code{"loo"} will be used. +#' @param folds numeric. Number of folds in the Mfold cross-validation. See Details. +#' @param nrepeat numierc. Number of times the Cross-Validation process is repeated. +#' This is an important argument to ensure the estimation of the performance to +#' be as accurate as possible. Default it 1. #' @param dist only applies to an object inheriting from \code{"plsda"}, #' \code{"splsda"} or \code{"mint.splsda"} to evaluate the classification #' performance of the model. Should be a subset of \code{"max.dist"}, #' \code{"centroids.dist"}, \code{"mahalanobis.dist"}. Default is \code{"all"}. #' See \code{\link{predict}}. -#' @param validation character. What kind of (internal) validation to use, -#' matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is -#' \code{"Mfold"}. -#' @param folds the folds in the Mfold cross-validation. See Details. -#' @param nrepeat Number of times the Cross-Validation process is repeated. -#' This is an important argument to ensure the estimation of the performance to -#' be as accurate as possible. #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) #' performance of the model. #' @param progressBar by default set to \code{FALSE} to output the progress bar diff --git a/man/perf.assess.Rd b/man/perf.assess.Rd index 264a7361..8f260eaa 100644 --- a/man/perf.assess.Rd +++ b/man/perf.assess.Rd @@ -109,15 +109,15 @@ performance of the model. Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, \code{"mahalanobis.dist"}. Default is \code{"all"}. See \code{\link{predict}}.} -\item{validation}{character. What kind of (internal) validation to use, +\item{validation}{a character string. What kind of (internal) validation to use, matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is -\code{"Mfold"}.} +\code{"Mfold"}. For MINT methods only \code{"loo"} will be used.} -\item{folds}{the folds in the Mfold cross-validation. See Details.} +\item{folds}{numeric. Number of folds in the Mfold cross-validation. See Details.} -\item{nrepeat}{Number of times the Cross-Validation process is repeated. +\item{nrepeat}{numierc. Number of times the Cross-Validation process is repeated. This is an important argument to ensure the estimation of the performance to -be as accurate as possible.} +be as accurate as possible. Default it 1.} \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model.} From 9d3f7c9fa45ea4bdb4be5f60103c951af57d5e44 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:20:54 +1100 Subject: [PATCH 49/75] rmove dist checking so can plot multiple dist metrics using internal perf function --- R/tune.mint.plsda.R | 4 ---- R/tune.mint.splsda.R | 11 +++++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index bd6777cb..d73de6ee 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -178,10 +178,6 @@ tune.mint.plsda <- if(sum(apply(table(Y,study)==0,2,sum)>0) >0) warning("At least one study does not contain all the levels of the outcome Y. The MINT algorithm might not perform as expected.") - - #-- dist - dist = match.arg(dist) - #-- light.output if (!is.logical(light.output)) stop("'light.output' must be either TRUE or FALSE", call. = FALSE) diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 01419a67..72813190 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -218,10 +218,6 @@ tune.mint.splsda <- if(sum(apply(table(Y,study)==0,2,sum)>0) >0) warning("At least one study does not contain all the levels of the outcome Y. The MINT algorithm might not perform as expected.") - - #-- dist - dist = match.arg(dist) - #-- light.output if (!is.logical(light.output)) stop("'light.output' must be either TRUE or FALSE", call. = FALSE) @@ -239,6 +235,13 @@ tune.mint.splsda <- } else { if (is.null(test.keepX) | length(test.keepX) == 1 | !is.numeric(test.keepX)) stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) + + #-- dist (for tune can only have one dist, for perf can have multiple) + dist = match.arg( + dist, + choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), + several.ok = TRUE + ) #-- end checking --# #------------------# From 326518e3b445291002f7e1e63cf19c1d79aaadff Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:41:43 +1100 Subject: [PATCH 50/75] clean tune tune.plsda and tune.splsda input args --- R/tune.plsda.R | 20 ++++++++++------- R/tune.splsda.R | 15 ++++++++----- man/tune.plsda.Rd | 54 +++++++++++++++++++++++----------------------- man/tune.splsda.Rd | 24 ++++++++++----------- 4 files changed, 61 insertions(+), 52 deletions(-) diff --git a/R/tune.plsda.R b/R/tune.plsda.R index 1fd6e294..34e74cc7 100644 --- a/R/tune.plsda.R +++ b/R/tune.plsda.R @@ -126,20 +126,24 @@ tune.plsda <- function (X, Y, ncomp = 1, + # params related to spls model building + scale = TRUE, + logratio = c('none','CLR'), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, + # params related to CV validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, + # params related to PA dist = "all", - scale = TRUE, auc = FALSE, + # params related to running progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - logratio = c('none','CLR'), - multilevel = NULL, light.output = TRUE, - signif.threshold = 0.01, BPPARAM = SerialParam(), seed = NULL ) diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 283ab5e8..bc6ce44a 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -163,23 +163,28 @@ tune.splsda <- function (X, Y, ncomp = 1, + # sparsity params test.keepX = NULL, # in which case perf() is run internally to only tune ncomp already.tested.X, + # params related to spls model building + scale = TRUE, + logratio = c('none','CLR'), max.iter = 100, tol = 1e-06, - scale = TRUE, near.zero.var = FALSE, - logratio = c('none','CLR'), multilevel = NULL, + # params related to CV validation = "Mfold", folds = 10, nrepeat = 1, signif.threshold = 0.01, - dist = NULL, - measure = "BER", # one of c("overall","BER","AUC") + # params related to PA + dist = "max.dist", + measure = "BER", # or 'overall' auc = FALSE, - light.output = TRUE, + # params related to running progressBar = FALSE, + light.output = TRUE, BPPARAM = SerialParam(), seed = NULL ) diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd index f87974be..63dff237 100644 --- a/man/tune.plsda.Rd +++ b/man/tune.plsda.Rd @@ -8,20 +8,20 @@ tune.plsda( X, Y, ncomp = 1, + scale = TRUE, + logratio = c("none", "CLR"), + max.iter = 100, + tol = 1e-06, + near.zero.var = FALSE, + multilevel = NULL, validation = "Mfold", folds = 10, + nrepeat = 1, + signif.threshold = 0.01, dist = "all", - scale = TRUE, auc = FALSE, progressBar = FALSE, - tol = 1e-06, - max.iter = 100, - near.zero.var = FALSE, - nrepeat = 1, - logratio = c("none", "CLR"), - multilevel = NULL, light.output = TRUE, - signif.threshold = 0.01, BPPARAM = SerialParam(), seed = NULL ) @@ -34,45 +34,45 @@ responses (for multi-response models) \code{NA}s are allowed.} \item{ncomp}{the number of components to include in the model.} -\item{validation}{character. What kind of (internal) validation to use, -matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). -Default is \code{"Mfold"}.} - -\item{folds}{the folds in the Mfold cross-validation. See Details.} - -\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all"} - \item{scale}{Logical. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE)} -\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC)} +\item{logratio}{one of ('none','CLR'). Default to 'none'} -\item{progressBar}{by default set to \code{TRUE} to output the progress bar -of the computation.} +\item{max.iter}{integer, the maximum number of iterations.} \item{tol}{Convergence stopping value.} -\item{max.iter}{integer, the maximum number of iterations.} - \item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Default value is FALSE} -\item{nrepeat}{Number of times the Cross-Validation process is repeated.} - -\item{logratio}{one of ('none','CLR'). Default to 'none'} - \item{multilevel}{Design matrix for multilevel analysis (for repeated measurements) that indicates the repeated measures on each individual, i.e. the individuals ID. See Details.} -\item{light.output}{if set to FALSE, the prediction/classification of each -sample for each of \code{test.keepX} and each comp is returned.} +\item{validation}{character. What kind of (internal) validation to use, +matching one of \code{"Mfold"} or \code{"loo"} (short for 'leave-one-out'). +Default is \code{"Mfold"}.} + +\item{folds}{the folds in the Mfold cross-validation. See Details.} + +\item{nrepeat}{Number of times the Cross-Validation process is repeated.} \item{signif.threshold}{numeric between 0 and 1 indicating the significance threshold required for improvement in error rate of the components. Default to 0.01.} +\item{dist}{Distance metric. Should be a subset of "max.dist", "centroids.dist", "mahalanobis.dist" or "all". Default is "all"} + +\item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC)} + +\item{progressBar}{by default set to \code{TRUE} to output the progress bar +of the computation.} + +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index 18b95bb1..770da82f 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -10,21 +10,21 @@ tune.splsda( ncomp = 1, test.keepX = NULL, already.tested.X, + scale = TRUE, + logratio = c("none", "CLR"), max.iter = 100, tol = 1e-06, - scale = TRUE, near.zero.var = FALSE, - logratio = c("none", "CLR"), multilevel = NULL, validation = "Mfold", folds = 10, nrepeat = 1, signif.threshold = 0.01, - dist = NULL, + dist = "max.dist", measure = "BER", auc = FALSE, - light.output = TRUE, progressBar = FALSE, + light.output = TRUE, BPPARAM = SerialParam(), seed = NULL ) @@ -45,19 +45,19 @@ using all variables in the \eqn{X} data set.} indicating the number of variables to select from the \eqn{X} data set on the firsts components.} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{logratio}{one of ('none','CLR'). Default to 'none'} + \item{max.iter}{integer, the maximum number of iterations.} \item{tol}{Convergence stopping value.} -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - \item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Default value is FALSE} -\item{logratio}{one of ('none','CLR'). Default to 'none'} - \item{multilevel}{Design matrix for multilevel analysis (for repeated measurements) that indicates the repeated measures on each individual, i.e. the individuals ID. See Details.} @@ -86,12 +86,12 @@ or the Area Under the Curve \code{AUC}. Only used when \code{test.keepX} is not \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model based on the optimisation measure \code{measure}.} -\item{light.output}{if set to FALSE, the prediction/classification of each -sample for each of \code{test.keepX} and each comp is returned.} - \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} +\item{light.output}{if set to FALSE, the prediction/classification of each +sample for each of \code{test.keepX} and each comp is returned.} + \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating the type of parallelisation. See examples in \code{?tune.spca}.} From 5dcc7770ae1d9d2ea11cec546f661abf0e97c3fe Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:52:15 +1100 Subject: [PATCH 51/75] reorg tune.mint input args --- R/tune.mint.plsda.R | 16 ++++++++++------ R/tune.mint.splsda.R | 18 ++++++++++++------ man/tune.mint.plsda.Rd | 42 ++++++++++++++++++++--------------------- man/tune.mint.splsda.Rd | 42 ++++++++++++++++++++--------------------- 4 files changed, 64 insertions(+), 54 deletions(-) diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index d73de6ee..ceddbb4c 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -90,16 +90,20 @@ tune.mint.plsda <- Y, ncomp = 1, study, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, + # model building params scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01 + # CV params + signif.threshold = 0.01, + # PA params + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + # running params + progressBar = FALSE, + light.output = TRUE # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat ) { #-- checking general input parameters --------------------------------------# diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index 72813190..cc8bb00c 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -112,18 +112,24 @@ tune.mint.splsda <- Y, ncomp = 1, study, + # sparsity params test.keepX = NULL, already.tested.X, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, + # model building params scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat - signif.threshold = 0.01 + # CV params + signif.threshold = 0.01, + # PA params + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + # running params + progressBar = FALSE, + light.output = TRUE # if FALSE, output the prediction and classification of each sample during each folds, on each comp, for each repeat + ) { #-- checking general input parameters --------------------------------------# diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd index 24344086..d5cb1260 100644 --- a/man/tune.mint.plsda.Rd +++ b/man/tune.mint.plsda.Rd @@ -9,16 +9,16 @@ tune.mint.plsda( Y, ncomp = 1, study, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, - signif.threshold = 0.01 + signif.threshold = 0.01, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + light.output = TRUE ) } \arguments{ @@ -33,6 +33,21 @@ Default to 1} \item{study}{grouping factor indicating which samples are from the same study} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{only applies to an object inheriting from \code{"plsda"} or \code{"splsda"} to evaluate the classification performance of the model. Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, @@ -48,23 +63,8 @@ performance of the model.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - -\item{tol}{Convergence stopping value.} - -\item{max.iter}{integer, the maximum number of iterations.} - -\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} -function (should be set to TRUE in particular for data with many zero -values). Default value is FALSE} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} - -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} } \value{ The returned value is a list with components: diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index 566b3e36..a85925ec 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -11,16 +11,16 @@ tune.mint.splsda( study, test.keepX = NULL, already.tested.X, - dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), - auc = FALSE, - progressBar = FALSE, scale = TRUE, tol = 1e-06, max.iter = 100, near.zero.var = FALSE, - light.output = TRUE, - signif.threshold = 0.01 + signif.threshold = 0.01, + dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), + measure = c("BER", "overall"), + auc = FALSE, + progressBar = FALSE, + light.output = TRUE ) } \arguments{ @@ -42,6 +42,21 @@ test from the \eqn{X} data set. If set to NULL only number of components will be number of variables to select from the \eqn{X} data set on the firsts components} +\item{scale}{Logical. If scale = TRUE, each block is standardized to zero +means and unit variances (default: TRUE)} + +\item{tol}{Convergence stopping value.} + +\item{max.iter}{integer, the maximum number of iterations.} + +\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} +function (should be set to TRUE in particular for data with many zero +values). Default value is FALSE} + +\item{signif.threshold}{numeric between 0 and 1 indicating the significance +threshold required for improvement in error rate of the components. Default +to 0.01.} + \item{dist}{only applies to an object inheriting from \code{"plsda"} or \code{"splsda"} to evaluate the classification performance of the model. Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, @@ -57,23 +72,8 @@ performance of the model.} \item{progressBar}{by default set to \code{TRUE} to output the progress bar of the computation.} -\item{scale}{Logical. If scale = TRUE, each block is standardized to zero -means and unit variances (default: TRUE)} - -\item{tol}{Convergence stopping value.} - -\item{max.iter}{integer, the maximum number of iterations.} - -\item{near.zero.var}{Logical, see the internal \code{\link{nearZeroVar}} -function (should be set to TRUE in particular for data with many zero -values). Default value is FALSE} - \item{light.output}{if set to FALSE, the prediction/classification of each sample for each of \code{test.keepX} and each comp is returned.} - -\item{signif.threshold}{numeric between 0 and 1 indicating the significance -threshold required for improvement in error rate of the components. Default -to 0.01.} } \value{ The returned value is a list with components: From 0d70ff5036c8227ca075ec58af7ab310b25227f8 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:55:16 +1100 Subject: [PATCH 52/75] how to use 'measure' arg in tune mint functions --- R/tune.mint.plsda.R | 7 ------- R/tune.mint.splsda.R | 9 +++++---- man/tune.mint.plsda.Rd | 4 ---- man/tune.mint.splsda.Rd | 3 ++- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/R/tune.mint.plsda.R b/R/tune.mint.plsda.R index ceddbb4c..28dca587 100644 --- a/R/tune.mint.plsda.R +++ b/R/tune.mint.plsda.R @@ -39,8 +39,6 @@ #' Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, #' \code{"mahalanobis.dist"}. Default is \code{"all"}. See #' \code{\link{predict}}. -#' @param measure Two misclassification measure are available: overall -#' misclassification error \code{overall} or the Balanced Error Rate \code{BER} #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) #' performance of the model. #' @param progressBar by default set to \code{TRUE} to output the progress bar @@ -99,7 +97,6 @@ tune.mint.plsda <- signif.threshold = 0.01, # PA params dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), auc = FALSE, # running params progressBar = FALSE, @@ -150,10 +147,6 @@ tune.mint.plsda <- if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) stop("invalid number of variates, 'ncomp'.") - - #-- measure - measure <- match.arg(measure) - # -- check using the check of mint.splsda Y.mat = unmap(Y) colnames(Y.mat) = levels(Y) diff --git a/R/tune.mint.splsda.R b/R/tune.mint.splsda.R index cc8bb00c..5b97f0a1 100644 --- a/R/tune.mint.splsda.R +++ b/R/tune.mint.splsda.R @@ -48,7 +48,8 @@ #' \code{"mahalanobis.dist"}. Default is \code{"all"}. See #' \code{\link{predict}}. #' @param measure Two misclassification measure are available: overall -#' misclassification error \code{overall} or the Balanced Error Rate \code{BER} +#' misclassification error \code{overall} or the Balanced Error Rate \code{BER}. +#' Only used when \code{test.keepX = NULL}. #' @param auc if \code{TRUE} calculate the Area Under the Curve (AUC) #' performance of the model. #' @param progressBar by default set to \code{TRUE} to output the progress bar @@ -177,9 +178,6 @@ tune.mint.splsda <- stop("invalid number of variates, 'ncomp'.") - #-- measure - measure <- match.arg(measure) - #if ((!is.null(already.tested.X)) && (length(already.tested.X) != (ncomp - 1)) ) #stop("The number of already tested parameters should be NULL or ", ncomp - 1, " since you set ncomp = ", ncomp) if (missing(already.tested.X)) @@ -248,6 +246,9 @@ tune.mint.splsda <- choices = c("max.dist", "centroids.dist", "mahalanobis.dist"), several.ok = TRUE ) + + #-- measure + measure <- match.arg(measure) #-- end checking --# #------------------# diff --git a/man/tune.mint.plsda.Rd b/man/tune.mint.plsda.Rd index d5cb1260..75df5ec3 100644 --- a/man/tune.mint.plsda.Rd +++ b/man/tune.mint.plsda.Rd @@ -15,7 +15,6 @@ tune.mint.plsda( near.zero.var = FALSE, signif.threshold = 0.01, dist = c("max.dist", "centroids.dist", "mahalanobis.dist"), - measure = c("BER", "overall"), auc = FALSE, progressBar = FALSE, light.output = TRUE @@ -54,9 +53,6 @@ Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, \code{"mahalanobis.dist"}. Default is \code{"all"}. See \code{\link{predict}}.} -\item{measure}{Two misclassification measure are available: overall -misclassification error \code{overall} or the Balanced Error Rate \code{BER}} - \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model.} diff --git a/man/tune.mint.splsda.Rd b/man/tune.mint.splsda.Rd index a85925ec..be0f7386 100644 --- a/man/tune.mint.splsda.Rd +++ b/man/tune.mint.splsda.Rd @@ -64,7 +64,8 @@ Should be a subset of \code{"max.dist"}, \code{"centroids.dist"}, \code{\link{predict}}.} \item{measure}{Two misclassification measure are available: overall -misclassification error \code{overall} or the Balanced Error Rate \code{BER}} +misclassification error \code{overall} or the Balanced Error Rate \code{BER}. +Only used when \code{test.keepX = NULL}.} \item{auc}{if \code{TRUE} calculate the Area Under the Curve (AUC) performance of the model.} From 217a2d1b9c71cb2d7b65150985fdd2606fe405fe Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:07:41 +1100 Subject: [PATCH 53/75] fix unit test --- tests/testthat/test-tune.splsda.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 3411df54..516b10d6 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -68,5 +68,5 @@ test_that("tune.splsda works when test.keepX = NULL and gives same result as per ) # check results match - expect_equal(tune_res$error.rate$overall[1,2], perf_res$error.rate$overall[1,2]) + expect_equal(tune_res$error.rate$overall[2,1], perf_res$error.rate$overall[2,1]) }) From 4ee868de6753455eafd4647d8a691c0d6575ecfc Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:17:15 +1100 Subject: [PATCH 54/75] add unit tests for tune.pca and match default args --- R/tune.pca.R | 4 ++-- man/tune.pca.Rd | 4 ++-- tests/testthat/test-tune.pca.R | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 tests/testthat/test-tune.pca.R diff --git a/R/tune.pca.R b/R/tune.pca.R index 449daa36..e1a62e2e 100644 --- a/R/tune.pca.R +++ b/R/tune.pca.R @@ -61,8 +61,8 @@ tune.pca <- function(X, ncomp = NULL, center = TRUE, # sets the mean of the data to zero, ensures that the first PC describes the direction of the maximum variance - scale = FALSE, # variance is unit across different units - max.iter = 500, + scale = TRUE, # variance is unit across different units + max.iter = 100, tol = 1e-09, logratio = c('none','CLR','ILR'), V = NULL, diff --git a/man/tune.pca.Rd b/man/tune.pca.Rd index 8e8593a3..1135e1f3 100644 --- a/man/tune.pca.Rd +++ b/man/tune.pca.Rd @@ -8,8 +8,8 @@ tune.pca( X, ncomp = NULL, center = TRUE, - scale = FALSE, - max.iter = 500, + scale = TRUE, + max.iter = 100, tol = 1e-09, logratio = c("none", "CLR", "ILR"), V = NULL, diff --git a/tests/testthat/test-tune.pca.R b/tests/testthat/test-tune.pca.R new file mode 100644 index 00000000..c1bcfa8f --- /dev/null +++ b/tests/testthat/test-tune.pca.R @@ -0,0 +1,11 @@ +context("tune.pca") + +test_that("tune.pca and tune(method='pca') are equivalent", { + data(srbct) + X <- srbct$gene[1:20, 1:200] + object1 <- tune.pca(X, ncomp = 2, center = TRUE, scale = TRUE) + object2 <- tune(method = "pca", X, ncomp = 2, center = TRUE, scale = TRUE) + # expect results the same + expect_equal(object1$ncomp, object2$ncomp) + expect_equal(object1$sdev[1], object2$sdev[1]) +}) From 768f9f95bff115708ee73928efa5cdec956b9fc6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:17:22 +1100 Subject: [PATCH 55/75] add new functions to tune wrapper --- R/tune.R | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/R/tune.R b/R/tune.R index de955377..7a100dff 100644 --- a/R/tune.R +++ b/R/tune.R @@ -272,6 +272,26 @@ tune <- # running params BPPARAM = BPPARAM, seed = seed) + ## ----------- PLS ----------- ## + + } else if (method == "pls") { + message("Calling 'tune.pls'") + + result = tune.pls( + # model building params + X = X, Y = Y, ncomp = ncomp, mode = mode, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, nrepeat = nrepeat, folds = folds, + # PA params + measure = measure, + # running params + progressBar = progressBar, + BPPARAM = BPPARAM, seed = seed) + + # multilevel?? + ## ----------- sPLS +/- multilevel ----------- ## } else if (method == "spls") { @@ -282,6 +302,8 @@ tune <- result = tune.spls( # model building params X = X, Y = Y, ncomp = ncomp, mode = mode, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, # sparsity params test.keepX = test.keepX, test.keepY = test.keepY, # CV params @@ -314,6 +336,28 @@ tune <- BPPARAM = BPPARAM, seed = seed) } + ## ----------- plsda ----------- ## + + } else if (method == "plsda") { + + message("Calling 'tune.plsda'") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.plsda( + # model building params + X = X, Y = Y, ncomp = ncomp, multilevel = multilevel, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, + # PA params + dist = dist, auc = auc, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + ## ----------- splsda ----------- ## } else if (method == "splsda") { @@ -326,7 +370,8 @@ tune <- result = tune.splsda( # model building params X = X, Y = Y, ncomp = ncomp, multilevel = multilevel, - logratio = logratio, max.iter = max.iter, near.zero.var = near.zero.var, + scale = scale, logratio = logratio, tol = tol, + max.iter = max.iter, near.zero.var = near.zero.var, # sparsity params test.keepX = test.keepX, already.tested.X = already.tested.X, # CV params @@ -338,6 +383,24 @@ tune <- BPPARAM = BPPARAM, seed = seed) + ## ----------- block.plsda ----------- ## + + } else if (method == "block.plsda") { + message("Calling 'tune.block.plsda'") + + result = tune.block.plsda( + # model building params + X = X, Y = Y, indY = indY, ncomp = ncomp, design = design, + tol = tol, scale = scale, max.iter = max.iter, near.zero.var = near.zero.var, + # CV params + validation = validation, folds = folds, nrepeat = nrepeat, signif.threshold = signif.threshold, + # PA params + dist = dist, weighted = weighted, + # running params + progressBar = progressBar, light.output = light.output, + BPPARAM = BPPARAM, seed = seed) + + ## ----------- block.splsda ----------- ## } else if (method == "block.splsda") { @@ -350,14 +413,31 @@ tune <- # sparsity params test.keepX = test.keepX, already.tested.X = already.tested.X, # CV params - validation = validation, folds = folds, nrepeat = nrepeat, + validation = validation, folds = folds, nrepeat = nrepeat, signif.threshold = signif.threshold, # PA params - dist = dist, measure = measure, - weighted = weighted, signif.threshold = signif.threshold, + dist = dist, weighted = weighted, measure = measure, # running params progressBar = progressBar, light.output = light.output, BPPARAM = BPPARAM, seed = seed) + + ## ----------- mint.plsda ----------- ## + + } else if (method == "mint.plsda") { + message("Calling 'tune.mint.plsda' with Leave-One-Group-Out Cross Validation (nrepeat = 1)") + + if (missing(ncomp)) + ncomp = 1 + + result = tune.mint.plsda( + # model building params + X = X, Y = Y, ncomp = ncomp, study = study, + scale = scale, tol = tol, max.iter = max.iter, near.zero.var = near.zero.var, + # PA params + dist = dist, auc = auc, + # running params - no need for seed or BPPARAM due to LOO CV + progressBar = progressBar,light.output = light.output) + ## ----------- mint.splsda ----------- ## From 58b8fb145ca103cf07cb06885ccf80b94e71f6b6 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:41:48 +1100 Subject: [PATCH 56/75] update output docs for perf --- R/perf.R | 29 +++++++++++++++++++---------- man/perf.Rd | 27 +++++++++++++++++---------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/R/perf.R b/R/perf.R index 5e713cbd..9ef66147 100644 --- a/R/perf.R +++ b/R/perf.R @@ -110,6 +110,7 @@ #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. #' Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation #' @param ... not used +#' #' @return For PLS and sPLS models, \code{perf} produces a list with the #' following components for every repeat: #' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only @@ -141,19 +142,25 @@ #' \item{cor.tpred, cor.upred}{Correlation between the #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -#' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate}{ For -#' PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification +#' predicted and actual components for X (t) and Y (u)} +#' +#' +#' +#' For PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification #' error rate estimation. The dimensions correspond to the components in the #' model and to the prediction method used, respectively. Note that error rates #' reported in any component include the performance of the model in earlier #' components for the specified \code{keepX} parameters (e.g. error rate #' reported for component 3 for \code{keepX = 20} already includes the fitted -#' model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -#' of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -#' consider using the \code{predict} function.} -#' \item{auc}{Averaged AUC values -#' over the \code{nrepeat}} +#' model on components 1 and 2 for \code{keepX = 20}). +#' \item{error.rate}{Prediction error rate for each dist and measure} +#' \item{auc}{AUC values per component averaged over the \code{nrepeat}} +#' \item{auc.all}{AUC values per component per repeat} +#' \item{predict}{A list of length ncomp that os predicted values of each sample for each class} +#' \item{features}{a list of features selected across the folds ($stable.X) for the keepX parameters from the input object.} +#' \item{choice.ncomp}{Otimal number of components for the model for each prediction distance using one-sided t-tests that test +#' for a significant difference in the mean error rate (gain in prediction) when components are added to the model.} +#' \item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} #' #' For mint.splsda models, \code{perf} produces the following outputs: #' \item{study.specific.error}{A list that gives BER, overall error rate and @@ -166,7 +173,7 @@ #' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint #' models} #' -#' For sgccda models, \code{perf} produces the following outputs: +#' For sgccda models (i.e. block (s)PLS-DA models), \code{perf} produces the following outputs: #' \item{error.rate}{Prediction error rate for each block of \code{object$X} #' and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for #' each block of \code{object$X}, each \code{dist} and each class} @@ -197,12 +204,14 @@ #' \item{WeightedVote.error.rate}{if more than one block, returns the error #' rate of the \code{WeightedVote} output} \item{weights}{Returns the weights #' of each block used for the weighted predictions, for each nrepeat and each -#' fold} \item{choice.ncomp}{For supervised models; returns the optimal number +#' fold} +#' \item{choice.ncomp}{For supervised models; returns the optimal number #' of components for the model for each prediction distance using one-sided #' t-tests that test for a significant difference in the mean error rate (gain #' in prediction) when components are added to the model. See more details in #' Rohart et al 2017 Suppl. For more than one block, an optimal ncomp is #' returned for each prediction framework.} +#' #' @author Ignacio González, Amrit Singh, Kim-Anh Lê Cao, Benoit Gautier, #' Florian Rohart, Al J Abadi #' @seealso \code{\link{predict}}, \code{\link{nipals}}, diff --git a/man/perf.Rd b/man/perf.Rd index 7f332e0f..2ce3eef8 100644 --- a/man/perf.Rd +++ b/man/perf.Rd @@ -185,19 +185,25 @@ if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate}{ For -PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification +predicted and actual components for X (t) and Y (u)} + + + +For PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification error rate estimation. The dimensions correspond to the components in the model and to the prediction method used, respectively. Note that error rates reported in any component include the performance of the model in earlier components for the specified \code{keepX} parameters (e.g. error rate reported for component 3 for \code{keepX = 20} already includes the fitted -model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -consider using the \code{predict} function.} -\item{auc}{Averaged AUC values -over the \code{nrepeat}} +model on components 1 and 2 for \code{keepX = 20}). +\item{error.rate}{Prediction error rate for each dist and measure} +\item{auc}{AUC values per component averaged over the \code{nrepeat}} +\item{auc.all}{AUC values per component per repeat} +\item{predict}{A list of length ncomp that os predicted values of each sample for each class} +\item{features}{a list of features selected across the folds ($stable.X) for the keepX parameters from the input object.} +\item{choice.ncomp}{Otimal number of components for the model for each prediction distance using one-sided t-tests that test +for a significant difference in the mean error rate (gain in prediction) when components are added to the model.} +\item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} For mint.splsda models, \code{perf} produces the following outputs: \item{study.specific.error}{A list that gives BER, overall error rate and @@ -210,7 +216,7 @@ predicted class of each sample for each \code{dist} and each of the \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint models} -For sgccda models, \code{perf} produces the following outputs: +For sgccda models (i.e. block (s)PLS-DA models), \code{perf} produces the following outputs: \item{error.rate}{Prediction error rate for each block of \code{object$X} and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for each block of \code{object$X}, each \code{dist} and each class} @@ -241,7 +247,8 @@ the predicted class for this particular sample over the blocks.} \item{WeightedVote.error.rate}{if more than one block, returns the error rate of the \code{WeightedVote} output} \item{weights}{Returns the weights of each block used for the weighted predictions, for each nrepeat and each -fold} \item{choice.ncomp}{For supervised models; returns the optimal number +fold} +\item{choice.ncomp}{For supervised models; returns the optimal number of components for the model for each prediction distance using one-sided t-tests that test for a significant difference in the mean error rate (gain in prediction) when components are added to the model. See more details in From 040a8448e6319495139037f5dbb019deb76e45c5 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:16:38 +1100 Subject: [PATCH 57/75] update perf.assess docs --- NAMESPACE | 12 ++--- R/perf.assess.R | 86 ++++++++++++++------------------- R/perf.assess.diablo.R | 1 - R/perf.assess.mint.plsda.R | 39 ++++++++++++++- R/perf.assess.pls.R | 2 + R/perf.assess.plsda.R | 3 +- man/perf.assess.Rd | 98 ++++++++++++++++---------------------- 7 files changed, 124 insertions(+), 117 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 24005b72..58e77579 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,12 +16,6 @@ S3method(circosPlot,block.plsda) S3method(circosPlot,block.spls) S3method(circosPlot,block.splsda) S3method(image,tune.rcc) -S3method(perf,assess.mint.plsda) -S3method(perf,assess.mint.splsda) -S3method(perf,assess.mixo_pls) -S3method(perf,assess.mixo_plsda) -S3method(perf,assess.mixo_spls) -S3method(perf,assess.mixo_splsda) S3method(perf,mint.pls) S3method(perf,mint.plsda) S3method(perf,mint.spls) @@ -31,6 +25,12 @@ S3method(perf,mixo_plsda) S3method(perf,mixo_spls) S3method(perf,mixo_splsda) S3method(perf,sgccda) +S3method(perf.assess,mint.plsda) +S3method(perf.assess,mint.splsda) +S3method(perf.assess,mixo_pls) +S3method(perf.assess,mixo_plsda) +S3method(perf.assess,mixo_spls) +S3method(perf.assess,mixo_splsda) S3method(perf.assess,sgccda) S3method(plot,pca) S3method(plot,perf.mint.plsda.mthd) diff --git a/R/perf.assess.R b/R/perf.assess.R index fb90d107..8c142839 100644 --- a/R/perf.assess.R +++ b/R/perf.assess.R @@ -34,16 +34,6 @@ #' that for PLS and sPLS objects, perf is performed on the pre-processed data #' after log ratio transform and multilevel analysis, if any. #' -#' Sparse methods. The sPLS, sPLS-DA and sgccda functions are run on several -#' and different subsets of data (the cross-folds) and will certainly lead to -#' different subset of selected features. Those are summarised in the output -#' \code{features$stable} (see output Value below) to assess how often the -#' variables are selected across all folds. Note that for PLS-DA and sPLS-DA -#' objects, perf is performed on the original data, i.e. before the -#' pre-processing step of the log ratio transform and multilevel analysis, if -#' any. In addition for these methods, the classification error rate is -#' averaged across all folds. -#' #' The mint.sPLS-DA function estimates errors based on Leave-one-group-out #' cross validation (where each levels of object$study is left out (and #' predicted) once) and provides study-specific outputs @@ -63,8 +53,7 @@ #' threshold based on distances (see \code{predict}) that optimally determine #' class membership of the samples tested. As such AUC and ROC are not needed #' to estimate the performance of the model. We provide those outputs as -#' complementary performance measures. See more details in our mixOmics -#' article. +#' complementary performance measures. #' #' Prediction distances. See details from \code{?predict}, and also our #' supplemental material in the mixOmics article. @@ -87,7 +76,7 @@ #' More details about the PLS modes in \code{?pls}. #' #' @param object object of class inherited from \code{"pls"}, \code{"plsda"}, -#' \code{"spls"}, \code{"splsda"} or \code{"mint.splsda"}. The function will +#' \code{"spls"}, \code{"splsda"}. \code{"sgccda"} or \code{"mint.splsda"}. The function will #' retrieve some key parameters stored in that object. #' @param validation a character string. What kind of (internal) validation to use, #' matching one of \code{"Mfold"} or \code{"loo"} (see below). Default is @@ -113,15 +102,16 @@ #' Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM', this will be overwritten by 'seed'. #' Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation #' @param ... not used -#' @return For PLS and sPLS models, \code{perf} produces a list with the -#' following components for every repeat: + +#' @return For PLS and sPLS models: #' \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only #' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only #' available when in regression (s)PLS.} #' \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only #' applies to object inherited from \code{"pls"}, and \code{"spls"}. Only #' available when in regression (s)PLS.} -#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables. Only applies to object +#' \item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +#' with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object #' inherited from \code{"pls"}, and \code{"spls"}. Only available when in #' regression (s)PLS.} #' \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values @@ -129,41 +119,44 @@ #' Note that in the specific case of an sPLS model, it is better to have a look #' at the Q2.total criterion, only applies to object inherited from #' \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} -#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for model, only applies to object inherited from +#' \item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +#' \ldots ,}\code{ncomp} components, only applies to object inherited from #' \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} -#' \item{RSS}{Residual Sum of Squares across all selected features.} +#' \item{RSS}{Residual Sum of Squares across all selected features} #' \item{PRESS}{Predicted Residual Error Sum of Squares across all selected features} -#' \item{features}{a list of features selected across the -#' folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and -#' \code{keepY} parameters from the input object. Note, this will be \code{NULL} -#' if using standard (non-sparse) PLS.} #' \item{cor.tpred, cor.upred}{Correlation between the #' predicted and actual components for X (t) and Y (u)} #' \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -#' predicted and actual components for X (t) and Y (u)} -#' \item{error.rate}{ For -#' PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification -#' error rate estimation using overall and BER error rates across different distance methods. -#' Although error rates are only reported for the number of components used in the final model, -#' Note that are calculated including the performance of the model in a smaller number of -#' components for the specified \code{keepX} parameters (e.g. error rate -#' reported for component 3 for \code{keepX = 20} already includes the fitted -#' model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -#' of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -#' consider using the \code{predict} function.} -#' \item{auc}{Averaged AUC values -#' over the \code{nrepeat}} -#' -#' #' For sgccda models, \code{perf} produces the following outputs: +#' predicted and actual components for X (t) and Y (u)} +#' +#' +#' +#' For PLS-DA and sPLS-DA models: +#' \item{error.rate}{Prediction error rate for each dist and measure} +#' \item{auc}{AUC value averaged over the \code{nrepeat}} +#' \item{auc.all}{AUC values per repeat} +#' \item{predict}{Predicted values of each sample for each class} +#' \item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} +#' +#' For mint.splsda models: +#' \item{study.specific.error}{A list that gives BER, overall error rate and +#' error rate per class, for each study} +#' \item{global.error}{A list that gives +#' BER, overall error rate and error rate per class for all samples} +#' \item{predict}{A list of length \code{ncomp} that produces the predicted +#' values of each sample for each class} +#' \item{class}{A list which gives the +#' predicted class of each sample for each \code{dist}.} +#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint models} +#' +#' For sgccda models (i.e. block (s)PLS-DA models): #' \item{error.rate}{Prediction error rate for each block of \code{object$X} #' and each \code{dist}} #' \item{error.rate.per.class}{Prediction error rate for #' each block of \code{object$X}, each \code{dist} and each class} -#' \item{predict}{Predicted values of each sample for each class and each block.} -#' \item{class}{Predicted class of each sample for each block, each \code{dist}, and each nrepeat} -#' \item{features}{a list of features selected across the folds (\code{$stable.X} and -#' \code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the -#' input object.} +#' \item{predict}{Predicted values of each sample for each class and each block} +#' \item{class}{Predicted class of each sample for each +#' block, each \code{dist}, and each nrepeat} #' \item{AveragedPredict.class}{if more than one block, returns #' the average predicted class over the blocks (averaged of the \code{Predict} #' output and prediction using the \code{max.dist} distance)} @@ -187,15 +180,6 @@ #' rate of the \code{WeightedVote} output} #' \item{weights}{Returns the weights of each block used for the weighted predictions, for each nrepeat and each #' fold} -#' -#' For mint.splsda models, \code{perf} produces the following outputs: -#' \item{study.specific.error}{A list that gives BER, overall error rate and -#' error rate per class, for each study} -#' \item{global.error}{A list that gives BER, overall error rate and error rate per class for all samples} -#' \item{predict}{A list of the predicted values of each sample for each class} -#' \item{class}{A list which gives the predicted class of each sample for each \code{dist}. Directly obtained from the \code{predict} output.} -#' \item{auc}{AUC values} \item{auc.study}{AUC values for each study} -#' #' @author Ignacio González, Amrit Singh, Kim-Anh Lê Cao, Benoit Gautier, #' Florian Rohart, Al J Abadi diff --git a/R/perf.assess.diablo.R b/R/perf.assess.diablo.R index 1673d75e..65bc6723 100644 --- a/R/perf.assess.diablo.R +++ b/R/perf.assess.diablo.R @@ -33,7 +33,6 @@ # folds - number of folds if validation = "Mfold" # ---------------------------------------------------------------------------------------------------------- #' @rdname perf.assess -#' @importFrom utils relist #' @method perf.assess sgccda #' @export perf.assess.sgccda <- diff --git a/R/perf.assess.mint.plsda.R b/R/perf.assess.mint.plsda.R index 4863d3e7..a635d502 100644 --- a/R/perf.assess.mint.plsda.R +++ b/R/perf.assess.mint.plsda.R @@ -1,5 +1,41 @@ -## -------------------------- perf.mint(s)plsda --------------------------- ## +############################################################################################################# +# Authors: +# Amrit Singh, University of British Columbia, Vancouver. +# Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD +# +# created: 01-04-2015 +# last modified: 27-05-2016 +# +# Copyright (C) 2015 +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +############################################################################################################# + + +# ---------------------------------------------------------------------------------------------------------- +# perf.assess.mint.plsda - Function to evaluate the performance of the fitted PLS (cross-validation) +# inputs: object - object obtain from running mint.plsda +# dist - to evaluate the classification performance +# validation - type of validation +# folds - number of folds if validation = "Mfold" +# ---------------------------------------------------------------------------------------------------------- +#' ## -------------------------- perf.mint(s)plsda --------------------------- ## + #' @rdname perf.assess +#' @method perf.assess mint.plsda #' @export perf.assess.mint.plsda <- function (object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), @@ -259,5 +295,6 @@ perf.assess.mint.plsda <- function (object, } #' @rdname perf.assess +#' @method perf.assess mint.splsda #' @export perf.assess.mint.splsda <- perf.assess.mint.plsda \ No newline at end of file diff --git a/R/perf.assess.pls.R b/R/perf.assess.pls.R index 01ef7f5b..1fb16737 100644 --- a/R/perf.assess.pls.R +++ b/R/perf.assess.pls.R @@ -31,6 +31,7 @@ ## -------------------------------- (s)PLS -------------------------------- ## #' @rdname perf.assess +#' @method perf.assess mixo_pls #' @export perf.assess.mixo_pls <- function(object, validation = c("Mfold", "loo"), @@ -133,6 +134,7 @@ perf.assess.mixo_pls <- function(object, } #' @rdname perf.assess +#' @method perf.assess mixo_spls #' @export perf.assess.mixo_spls <- perf.assess.mixo_pls diff --git a/R/perf.assess.plsda.R b/R/perf.assess.plsda.R index 4485e9c1..fa477a2e 100644 --- a/R/perf.assess.plsda.R +++ b/R/perf.assess.plsda.R @@ -31,7 +31,7 @@ ## ------------------------------- (s)PLSDA ------------------------------- ## #' @rdname perf.assess -#' @importFrom methods hasArg +#' @method perf.assess mixo_plsda #' @export perf.assess.mixo_plsda <- function(object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), @@ -328,5 +328,6 @@ perf.assess.mixo_plsda <- function(object, } #' @rdname perf.assess +#' @method perf.assess mixo_splsda #' @export perf.assess.mixo_splsda <- perf.assess.mixo_plsda diff --git a/man/perf.assess.Rd b/man/perf.assess.Rd index 8f260eaa..13564f31 100644 --- a/man/perf.assess.Rd +++ b/man/perf.assess.Rd @@ -28,7 +28,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mint.plsda}( +\method{perf.assess}{mint.plsda}( object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), auc = FALSE, @@ -37,7 +37,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mint.splsda}( +\method{perf.assess}{mint.splsda}( object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), auc = FALSE, @@ -46,7 +46,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mixo_pls}( +\method{perf.assess}{mixo_pls}( object, validation = c("Mfold", "loo"), folds, @@ -57,7 +57,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mixo_spls}( +\method{perf.assess}{mixo_spls}( object, validation = c("Mfold", "loo"), folds, @@ -68,7 +68,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mixo_plsda}( +\method{perf.assess}{mixo_plsda}( object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), validation = c("Mfold", "loo"), @@ -82,7 +82,7 @@ perf.assess(object, ...) ... ) -\method{perf}{assess.mixo_splsda}( +\method{perf.assess}{mixo_splsda}( object, dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), validation = c("Mfold", "loo"), @@ -98,7 +98,7 @@ perf.assess(object, ...) } \arguments{ \item{object}{object of class inherited from \code{"pls"}, \code{"plsda"}, -\code{"spls"}, \code{"splsda"} or \code{"mint.splsda"}. The function will +\code{"spls"}, \code{"splsda"}. \code{"sgccda"} or \code{"mint.splsda"}. The function will retrieve some key parameters stored in that object.} \item{...}{not used} @@ -137,15 +137,15 @@ Not recommended during exploratory analysis. Note if RNGseed is set in 'BPPARAM' Note 'seed' is not required or used in perf.mint.plsda as this method uses loo cross-validation} } \value{ -For PLS and sPLS models, \code{perf} produces a list with the -following components for every repeat: +For PLS and sPLS models: \item{MSEP}{Mean Square Error Prediction for each \eqn{Y} variable, only applies to object inherited from \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} \item{RMSEP}{Root Mean Square Error Prediction for each \eqn{Y} variable, only applies to object inherited from \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} -\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables. Only applies to object +\item{R2}{a matrix of \eqn{R^2} values of the \eqn{Y}-variables for models +with \eqn{1, \ldots ,}\code{ncomp} components, only applies to object inherited from \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} \item{Q2}{if \eqn{Y} contains one variable, a vector of \eqn{Q^2} values @@ -153,41 +153,44 @@ else a list with a matrix of \eqn{Q^2} values for each \eqn{Y}-variable. Note that in the specific case of an sPLS model, it is better to have a look at the Q2.total criterion, only applies to object inherited from \code{"pls"}, and \code{"spls"}. Only available when in regression (s)PLS.} -\item{Q2.total}{a vector of \eqn{Q^2}-total values for model, only applies to object inherited from +\item{Q2.total}{a vector of \eqn{Q^2}-total values for models with \eqn{1, +\ldots ,}\code{ncomp} components, only applies to object inherited from \code{"pls"}, and \code{"spls"}. Available in both (s)PLS modes.} -\item{RSS}{Residual Sum of Squares across all selected features.} +\item{RSS}{Residual Sum of Squares across all selected features} \item{PRESS}{Predicted Residual Error Sum of Squares across all selected features} -\item{features}{a list of features selected across the -folds (\code{$stable.X} and \code{$stable.Y}) for the \code{keepX} and -\code{keepY} parameters from the input object. Note, this will be \code{NULL} -if using standard (non-sparse) PLS.} \item{cor.tpred, cor.upred}{Correlation between the predicted and actual components for X (t) and Y (u)} \item{RSS.tpred, RSS.upred}{Residual Sum of Squares between the -predicted and actual components for X (t) and Y (u)} -\item{error.rate}{ For -PLS-DA and sPLS-DA models, \code{perf} produces a matrix of classification -error rate estimation using overall and BER error rates across different distance methods. -Although error rates are only reported for the number of components used in the final model, -Note that are calculated including the performance of the model in a smaller number of -components for the specified \code{keepX} parameters (e.g. error rate -reported for component 3 for \code{keepX = 20} already includes the fitted -model on components 1 and 2 for \code{keepX = 20}). For more advanced usage -of the \code{perf} function, see \url{www.mixomics.org/methods/spls-da/} and -consider using the \code{predict} function.} -\item{auc}{Averaged AUC values -over the \code{nrepeat}} - -#' For sgccda models, \code{perf} produces the following outputs: +predicted and actual components for X (t) and Y (u)} + + + +For PLS-DA and sPLS-DA models: +\item{error.rate}{Prediction error rate for each dist and measure} +\item{auc}{AUC value averaged over the \code{nrepeat}} +\item{auc.all}{AUC values per repeat} +\item{predict}{Predicted values of each sample for each class} +\item{class}{A list which gives the predicted class of each sample for each dist and each of the ncomp components} + +For mint.splsda models: +\item{study.specific.error}{A list that gives BER, overall error rate and +error rate per class, for each study} +\item{global.error}{A list that gives +BER, overall error rate and error rate per class for all samples} +\item{predict}{A list of length \code{ncomp} that produces the predicted +values of each sample for each class} +\item{class}{A list which gives the +predicted class of each sample for each \code{dist}.} +\item{auc}{AUC values} \item{auc.study}{AUC values for each study in mint models} + +For sgccda models (i.e. block (s)PLS-DA models): \item{error.rate}{Prediction error rate for each block of \code{object$X} and each \code{dist}} \item{error.rate.per.class}{Prediction error rate for each block of \code{object$X}, each \code{dist} and each class} -\item{predict}{Predicted values of each sample for each class and each block.} -\item{class}{Predicted class of each sample for each block, each \code{dist}, and each nrepeat} -\item{features}{a list of features selected across the folds (\code{$stable.X} and -\code{$stable.Y}) for the \code{keepX} and \code{keepY} parameters from the -input object.} +\item{predict}{Predicted values of each sample for each class and each block} +\item{class}{Predicted class of each sample for each +block, each \code{dist}, and each nrepeat} \item{AveragedPredict.class}{if more than one block, returns the average predicted class over the blocks (averaged of the \code{Predict} output and prediction using the \code{max.dist} distance)} @@ -210,15 +213,7 @@ the predicted class for this particular sample over the blocks.} \item{WeightedVote.error.rate}{if more than one block, returns the error rate of the \code{WeightedVote} output} \item{weights}{Returns the weights of each block used for the weighted predictions, for each nrepeat and each -fold} - -For mint.splsda models, \code{perf} produces the following outputs: -\item{study.specific.error}{A list that gives BER, overall error rate and -error rate per class, for each study} -\item{global.error}{A list that gives BER, overall error rate and error rate per class for all samples} -\item{predict}{A list of the predicted values of each sample for each class} -\item{class}{A list which gives the predicted class of each sample for each \code{dist}. Directly obtained from the \code{predict} output.} -\item{auc}{AUC values} \item{auc.study}{AUC values for each study} +fold} } \description{ Function to evaluate the performance of the fitted PLS, sparse PLS, PLS-DA, @@ -256,16 +251,6 @@ MSEP, \eqn{R^2}, and \eqn{Q^2} criteria are averaged across all folds. Note that for PLS and sPLS objects, perf is performed on the pre-processed data after log ratio transform and multilevel analysis, if any. -Sparse methods. The sPLS, sPLS-DA and sgccda functions are run on several -and different subsets of data (the cross-folds) and will certainly lead to -different subset of selected features. Those are summarised in the output -\code{features$stable} (see output Value below) to assess how often the -variables are selected across all folds. Note that for PLS-DA and sPLS-DA -objects, perf is performed on the original data, i.e. before the -pre-processing step of the log ratio transform and multilevel analysis, if -any. In addition for these methods, the classification error rate is -averaged across all folds. - The mint.sPLS-DA function estimates errors based on Leave-one-group-out cross validation (where each levels of object$study is left out (and predicted) once) and provides study-specific outputs @@ -285,8 +270,7 @@ details. Our multivariate supervised methods already use a prediction threshold based on distances (see \code{predict}) that optimally determine class membership of the samples tested. As such AUC and ROC are not needed to estimate the performance of the model. We provide those outputs as -complementary performance measures. See more details in our mixOmics -article. +complementary performance measures. Prediction distances. See details from \code{?predict}, and also our supplemental material in the mixOmics article. From 2f445ad349f0015c48e922421db6eecb5b19b1c0 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:31:41 +1100 Subject: [PATCH 58/75] add examples to perf.assess --- R/perf.assess.R | 2 +- examples/perf.assess-examples.R | 93 +++++++++++++ man/perf.assess.Rd | 222 +++++++++++++------------------- 3 files changed, 184 insertions(+), 133 deletions(-) create mode 100644 examples/perf.assess-examples.R diff --git a/R/perf.assess.R b/R/perf.assess.R index 8c142839..767283fa 100644 --- a/R/perf.assess.R +++ b/R/perf.assess.R @@ -231,7 +231,7 @@ #' 4:e1845. #' @keywords regression multivariate #' @export -#' @example ./examples/perf-examples.R +#' @example ./examples/perf.assess-examples.R ## ------------------------------- Generic -------------------------------- ## perf.assess <- function(object, ...) UseMethod("perf.assess") \ No newline at end of file diff --git a/examples/perf.assess-examples.R b/examples/perf.assess-examples.R new file mode 100644 index 00000000..db325574 --- /dev/null +++ b/examples/perf.assess-examples.R @@ -0,0 +1,93 @@ +## PLS-DA example + +data(liver.toxicity) # rats gex and clinical measurements/treatments +unique(liver.toxicity$treatment$Treatment.Group) # 16 groups +length(liver.toxicity$treatment$Treatment.Group) # 64 samples + +plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2) + +performance <- perf.assess(plsda.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) # for reproducibility, remove for analysis + +performance$error.rate$BER + +## sPLS-DA example + +splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, + keepX = c(25, 25), ncomp = 2) + +performance <- perf.assess(splsda.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$error.rate$BER # can see slight improvement in error rate over PLS-DA example + +## PLS example + +ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model + +pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2) + +performance <- perf.assess(pls.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs + +## sPLS example + +spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50)) + +performance <- perf.assess(spls.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs + + +## block PLS-DA example + +data("breast.TCGA") +mrna <- breast.TCGA$data.train$mrna +mirna <- breast.TCGA$data.train$mirna +data <- list(mrna = mrna, mirna = mirna) +design <- matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data))) +diag(design) <- 0 + +block.plsda.res <- block.plsda(X = data, Y = breast.TCGA$data.train$subtype, + ncomp = 2, design = design) + +performance <- perf.assess(block.plsda.res) + +performance$error.rate.per.class # error rate per class per distance metric + +## block sPLS-DA example + +block.splsda.res <- block.splsda(X = data, Y = breast.TCGA$data.train$subtype, + ncomp = 2, design = design, + keepX = list(mrna = c(8,8), mirna = c(8,8))) + +performance <- perf.assess(block.splsda.res) + +performance$error.rate.per.class + +## MINT PLS-DA example + +data("stemcells") + +mint.plsda.res <- mint.plsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3, + study = stemcells$study) + +performance <- perf.assess(mint.plsda.res) + +performance$global.error$BER # global error per distance metric + +## MINT sPLS-DA example + +mint.splsda.res <- mint.splsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3, + keepX = c(10, 5, 15), study = stemcells$study) + +performance <- perf.assess(mint.splsda.res) + +performance$global.error$BER # error slightly higher in this sparse model verses non-sparse \ No newline at end of file diff --git a/man/perf.assess.Rd b/man/perf.assess.Rd index 13564f31..914b96fa 100644 --- a/man/perf.assess.Rd +++ b/man/perf.assess.Rd @@ -293,141 +293,99 @@ given block with the outcome variable Y. More details about the PLS modes in \code{?pls}. } \examples{ -## validation for objects of class 'pls' (regression) -# ---------------------------------------- -data(liver.toxicity) -X <- liver.toxicity$gene -Y <- liver.toxicity$clinic - -# try tune the number of component to choose -# --------------------- -# first learn the full model -liver.pls <- pls(X, Y, ncomp = 5) - -# with 5-fold cross validation: we use the same parameters as in model above -# but we perform cross validation to compute the MSEP, Q2 and R2 criteria -# --------------------------- -liver.val <- perf(liver.pls, validation = "Mfold", folds = 5) - -# see available criteria -names(liver.val$measures) -# see values for all repeats -liver.val$measures$Q2.total$values -# see summary over repeats -liver.val$measures$Q2.total$summary -# Q2 total should decrease until it reaches a threshold -liver.val$measures$Q2.total - -# ncomp = 2 is enough -plot(liver.val, criterion = 'Q2.total') - -\dontrun{ - -# have a look at the other criteria -# ---------------------- -# R2 -plot(liver.val, criterion = 'R2') -## correlation of components (see docs) -plot(liver.val, criterion = 'cor.tpred') - -# MSEP -plot(liver.val, criterion = 'MSEP') -## validation for objects of class 'spls' (regression) -# ---------------------------------------- -ncomp = 7 -# first, learn the model on the whole data set -model.spls = spls(X, Y, ncomp = ncomp, mode = 'regression', - keepX = c(rep(10, ncomp)), keepY = c(rep(4,ncomp))) - - -# with leave-one-out cross validation -model.spls.val <- perf(model.spls, validation = "Mfold", folds = 5, seed = 45 ) - -#Q2 total -model.spls.val$measures$Q2$summary - -# R2: we can see how the performance degrades when ncomp increases -plot(model.spls.val, criterion="R2") - -## validation for objects of class 'splsda' (classification) -# ---------------------------------------- -data(srbct) -X <- srbct$gene -Y <- srbct$class - -ncomp = 2 - -srbct.splsda <- splsda(X, Y, ncomp = ncomp, keepX = rep(10, ncomp)) - -# with Mfold -# --------- -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", auc = TRUE, seed = 45) -error -error$auc - -plot(error) - -# parallel code -library(BiocParallel) -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", auc = TRUE, BPPARAM = SnowParam(workers = 2), seed = 45) - -# with 5 components and nrepeat=5, to get a $choice.ncomp -ncomp = 5 -srbct.splsda <- splsda(X, Y, ncomp = ncomp, keepX = rep(10, ncomp)) - -error <- perf(srbct.splsda, validation = "Mfold", folds = 8, -dist = "all", nrepeat = 5, seed = 45) -error$choice.ncomp - -plot(error) - - -## validation for objects of class 'mint.splsda' (classification) -# ---------------------------------------- - -data(stemcells) -res = mint.splsda(X = stemcells$gene, Y = stemcells$celltype, - ncomp = 3, keepX = c(10, 5, 15), - study = stemcells$study) - -out = perf(res, auc = TRUE) -out -plot(out) -out$auc -out$auc.study - -## validation for objects of class 'sgccda' (classification) -# ---------------------------------------- - -data(nutrimouse) -Y = nutrimouse$diet -data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid) - -nutrimouse.sgccda <- block.splsda(X=data, -Y = Y, -design = 'full', -keepX = list(gene=c(10,10), lipid=c(15,15)), -ncomp = 2) - -perf = perf(nutrimouse.sgccda) -perf -plot(perf) +## PLS-DA example +data(liver.toxicity) # rats gex and clinical measurements/treatments +unique(liver.toxicity$treatment$Treatment.Group) # 16 groups +length(liver.toxicity$treatment$Treatment.Group) # 64 samples -# with 5 components and nrepeat=5 to get $choice.ncomp -nutrimouse.sgccda <- block.splsda(X=data, -Y = Y, -design = 'full', -keepX = list(gene=c(10,10), lipid=c(15,15)), -ncomp = 5) +plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2) -perf = perf(nutrimouse.sgccda, folds = 5, nrepeat = 5) -perf -plot(perf) -perf$choice.ncomp -} +performance <- perf.assess(plsda.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) # for reproducibility, remove for analysis + +performance$error.rate$BER + +## sPLS-DA example + +splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, + keepX = c(25, 25), ncomp = 2) + +performance <- perf.assess(splsda.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$error.rate$BER # can see slight improvement in error rate over PLS-DA example + +## PLS example + +ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model + +pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2) + +performance <- perf.assess(pls.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs + +## sPLS example + +spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50)) + +performance <- perf.assess(spls.res, + validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + seed = 12) + +performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs + + +## block PLS-DA example + +data("breast.TCGA") +mrna <- breast.TCGA$data.train$mrna +mirna <- breast.TCGA$data.train$mirna +data <- list(mrna = mrna, mirna = mirna) +design <- matrix(1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data))) +diag(design) <- 0 + +block.plsda.res <- block.plsda(X = data, Y = breast.TCGA$data.train$subtype, + ncomp = 2, design = design) + +performance <- perf.assess(block.plsda.res) + +performance$error.rate.per.class # error rate per class per distance metric + +## block sPLS-DA example + +block.splsda.res <- block.splsda(X = data, Y = breast.TCGA$data.train$subtype, + ncomp = 2, design = design, + keepX = list(mrna = c(8,8), mirna = c(8,8))) + +performance <- perf.assess(block.splsda.res) + +performance$error.rate.per.class + +## MINT PLS-DA example + +data("stemcells") + +mint.plsda.res <- mint.plsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3, + study = stemcells$study) + +performance <- perf.assess(mint.plsda.res) + +performance$global.error$BER # global error per distance metric + +## MINT sPLS-DA example + +mint.splsda.res <- mint.splsda(X = stemcells$gene, Y = stemcells$celltype, ncomp = 3, + keepX = c(10, 5, 15), study = stemcells$study) + +performance <- perf.assess(mint.splsda.res) + +performance$global.error$BER # error slightly higher in this sparse model verses non-sparse } \references{ Singh A., Shannon C., Gautier B., Rohart F., Vacher M., Tebbutt S. From 5051a357f8985b18d0566b65d9d75af2b84ae65d Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:40:24 +1100 Subject: [PATCH 59/75] add unit tests for tune.plsda --- tests/testthat/test-tune.plsda.R | 45 +++++++++++++++++++++++++++++++ tests/testthat/test-tune.splsda.R | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test-tune.plsda.R diff --git a/tests/testthat/test-tune.plsda.R b/tests/testthat/test-tune.plsda.R new file mode 100644 index 00000000..f66c1ba5 --- /dev/null +++ b/tests/testthat/test-tune.plsda.R @@ -0,0 +1,45 @@ +context("tune.plsda") +library(BiocParallel) + +test_that("tune.plsda works and is the same perf alone and in tune wrapper", code = { + + # set up data + data(breast.tumors) + X = breast.tumors$gene.exp + Y = as.factor(breast.tumors$sample$treatment) + + # run alone + tune.plsda.res.1 = suppressWarnings( + tune.splsda(X, Y, ncomp = 2, nrepeat = 1, logratio = "none", + folds = 2, + BPPARAM = SerialParam(), seed = 42) + ) + + # run in wrapper + tune.plsda.res.2 = suppressWarnings( + tune(X, Y, ncomp = 2, nrepeat = 1, logratio = "none", + folds = 2, + BPPARAM = SnowParam(workers = 2), seed = 42, + method = "plsda") + ) + + # run perf + model <- plsda(X, Y, ncomp = 2, logratio = "none") + tune.plsda.res.3 = suppressWarnings( + perf(model, ncomp = 2, nrepeat = 1, + folds = 2, + BPPARAM = SerialParam(), seed = 42, + method = "splsda") + ) + + + # check outputs format + expect_equal(class(tune.plsda.res.1)[1], "perf") + expect_equal(class(tune.plsda.res.2)[1], "perf") + expect_equal(class(tune.plsda.res.3)[1], "perf") + # check outputs values + .expect_numerically_close(tune.plsda.res.1$error.rate$overall[1,1], 0.5106383) + .expect_numerically_close(tune.plsda.res.2$error.rate$overall[1,1], 0.5106383) + .expect_numerically_close(tune.plsda.res.3$error.rate$overall[1,1], 0.5106383) + +}) \ No newline at end of file diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 516b10d6..7528a032 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -1,7 +1,7 @@ context("tune.splsda") library(BiocParallel) -test_that("tune.spls works and is the same in parallel and when run in tune wrapper", code = { +test_that("tune.splsda works and is the same in parallel and when run in tune wrapper", code = { # set up data data(breast.tumors) From 582271021a60c3d8a8f6478a26ac85938825d4b9 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:47:15 +1100 Subject: [PATCH 60/75] unit tests for tune.pls --- tests/testthat/test-tune.pls.R | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/testthat/test-tune.pls.R diff --git a/tests/testthat/test-tune.pls.R b/tests/testthat/test-tune.pls.R new file mode 100644 index 00000000..d596e0e0 --- /dev/null +++ b/tests/testthat/test-tune.pls.R @@ -0,0 +1,34 @@ +context("tune.pls") +library(BiocParallel) + +test_that("tune.pls works and is the same as perf alone and in tune wrapper", code = { + + # set up data + data(liver.toxicity) + X <- liver.toxicity$gene + Y <- liver.toxicity$clinic + + # independently + tune.res.1 <- tune.pls( X, Y, ncomp = 2, measure = "cor", logratio = "none", + folds = 5, nrepeat = 1, progressBar = FALSE, seed = 100) + + # in tune wrapper + tune.res.2 <- tune( X, Y, ncomp = 2, measure = "cor", logratio = "none", + folds = 5, nrepeat = 1, method = "pls", seed = 100) + + # with perf + model <- pls(X, Y, ncomp = 10, logratio = "none") + tune.res.3 <- perf(model, folds = 5, nrepeat = 1, seed = 100) + + # check outputs format + expect_equal(class(tune.res.1)[1], "perf.pls.mthd") + expect_equal(class(tune.res.2)[1], "perf.pls.mthd") + expect_equal(class(tune.res.3)[1], "perf.pls.mthd") + + # check output values + .expect_numerically_close(tune.res.1$measures$Q2$summary[1,3], -0.1304012) + .expect_numerically_close(tune.res.2$measures$Q2$summary[1,3], -0.1304012) + .expect_numerically_close(tune.res.3$measures$Q2$summary[1,3], -0.1304012) + +}) + \ No newline at end of file From 834f332ba54a7c4cf32022550263d07a105e4b19 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:28:50 +1100 Subject: [PATCH 61/75] make sure tune.spls and tune wrapper match default args --- R/tune.spls.R | 2 +- man/tune.spls.Rd | 2 +- tests/testthat/test-tune.spls.R | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/R/tune.spls.R b/R/tune.spls.R index 3821f833..652111e5 100644 --- a/R/tune.spls.R +++ b/R/tune.spls.R @@ -163,7 +163,7 @@ tune.spls <- mode = c('regression', 'canonical', 'classic'), scale = TRUE, logratio = "none", - tol = 1e-06, + tol = 1e-09, max.iter = 100, near.zero.var = FALSE, multilevel = NULL, diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index 5bb87b5d..f18b866f 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -13,7 +13,7 @@ tune.spls( mode = c("regression", "canonical", "classic"), scale = TRUE, logratio = "none", - tol = 1e-06, + tol = 1e-09, max.iter = 100, near.zero.var = FALSE, multilevel = NULL, diff --git a/tests/testthat/test-tune.spls.R b/tests/testthat/test-tune.spls.R index a2773e01..7b923237 100644 --- a/tests/testthat/test-tune.spls.R +++ b/tests/testthat/test-tune.spls.R @@ -10,24 +10,27 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap # run in serial tune.spls.res.1 = suppressWarnings(tune.spls(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SerialParam(RNGseed = 5), # RNGseed is ignored seed = 5212)) # run in parallel tune.spls.res.2 = suppressWarnings(tune.spls(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SnowParam(workers = 2), seed = 5212)) # in tune wrapper in serial tune.spls.res.3 = suppressWarnings(tune(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SerialParam(RNGseed = NULL), seed = 5212, @@ -36,8 +39,9 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap # in tune wrapper in parallel tune.spls.res.4 = suppressWarnings(tune(X, Y, ncomp = 2, - test.keepX = seq(5, 10, 5), + test.keepX = seq(5, 10, 5), test.keepY = seq(3, 6, 3), measure = "cor", + logratio = "none", mode = "regression", folds = 2, nrepeat = 1, progressBar = FALSE, BPPARAM = SnowParam(workers = 2), method = "spls", @@ -57,9 +61,9 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap expect_equal(unname(tune.spls.res.4$choice.keepY), c(3,6)) # check outputs exactly the same regardless of how the function was run - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.2$measure.pred$mean) - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.3$measure.pred$mean) - expect_equal(tune.spls.res.1$measure.pred$mean, tune.spls.res.4$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.2$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.3$measure.pred$mean) + .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.4$measure.pred$mean) }) ## If ncol(Y) == 1 tune.spls calls tune.spls1 From 92a64574ca1b9bd0fd1d558cf569ad3b68b71553 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:29:02 +1100 Subject: [PATCH 62/75] add unit tests for tune.block.plsda --- tests/testthat/test-tune.block.plsda.R | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/testthat/test-tune.block.plsda.R diff --git a/tests/testthat/test-tune.block.plsda.R b/tests/testthat/test-tune.block.plsda.R new file mode 100644 index 00000000..5bbe8210 --- /dev/null +++ b/tests/testthat/test-tune.block.plsda.R @@ -0,0 +1,46 @@ +context("tune.block.plsda") +library(BiocParallel) + +test_that("tune.block.plsda works and is the same as perf alone and in tune wrapper", code = { + + data("breast.TCGA") + X <- list(mrna = breast.TCGA$data.train$mrna, mirna = breast.TCGA$data.train$mirna, protein = breast.TCGA$data.train$protein) + Y <- breast.TCGA$data.train$subtype + set.seed(100) + subset <- mixOmics:::stratified.subsampling(breast.TCGA$data.train$subtype, folds = 10)[[1]][[1]] + X <- lapply(X, function(omic) omic[subset,]) + Y <- Y[subset] + design = matrix(1, ncol = length(X), nrow = length(X), + dimnames = list(names(X), names(X))) + diag(design) = 0 + design + + # run tune.block.plsda + tune.res.1 <- suppressWarnings( + tune.block.plsda(X, Y, design = design, ncomp = 2, nrepeat = 1, folds = 3, + seed = 13, dist = c("all")) + ) + # run tune wrapper + tune.res.2 <- suppressWarnings( + tune(X, Y, design = design, ncomp = 2, nrepeat = 1, folds = 3, + seed = 13, dist = c("all"), method = "block.plsda") + ) + # run perf + model <- block.plsda(X, Y, design = design, ncomp = 2) + tune.res.3 <- suppressWarnings( + perf(model, nrepeat = 1, seed = 13, folds = 3) + ) + + # check outputs format + expect_equal(class(tune.res.1)[1], "perf.sgccda.mthd") + expect_equal(class(tune.res.2)[1], "perf.sgccda.mthd") + expect_equal(class(tune.res.3)[1], "perf.sgccda.mthd") + + # check output values + .expect_numerically_close(tune.res.1$error.rate.per.class$mrna$max.dist[3,2], 0.25) + .expect_numerically_close(tune.res.2$error.rate.per.class$mrna$max.dist[3,2], 0.25) + .expect_numerically_close(tune.res.3$error.rate.per.class$mrna$max.dist[3,2], 0.25) + +}) + + \ No newline at end of file From 045d1d8ff497b0181b3c907941917660e536fe66 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:41:05 +1100 Subject: [PATCH 63/75] add unit test for tune.mint.plsda --- tests/testthat/test-tune.mint.plsda.R | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/testthat/test-tune.mint.plsda.R diff --git a/tests/testthat/test-tune.mint.plsda.R b/tests/testthat/test-tune.mint.plsda.R new file mode 100644 index 00000000..30c515ac --- /dev/null +++ b/tests/testthat/test-tune.mint.plsda.R @@ -0,0 +1,38 @@ +context("tune.mint.plsda") +library(BiocParallel) + +test_that("tune.mint.plsda works and is the same perf alone and in tune wrapper", code = { + + # set up data + data(stemcells) + X = stemcells$gene + Y = stemcells$celltype + study = stemcells$study + + # run alone + tune.res.1 = suppressWarnings( + tune.mint.plsda(X, Y, study = study, ncomp = 2) + ) + + # run in wrapper + tune.res.2 = suppressWarnings( + tune(method = "mint.plsda", X, Y, study = study, ncomp = 2) + ) + + # run perf + model <- mint.plsda(X, Y, ncomp = 2, study = study) + tune.res.3 = suppressWarnings( + perf(model, ncomp = 2) + ) + + # check outputs format + expect_equal(class(tune.res.1)[1], "perf") + expect_equal(class(tune.res.2)[1], "perf") + expect_equal(class(tune.res.3)[1], "perf") + + # check outputs values + .expect_numerically_close(tune.res.1$global.error$BER[1,1], 0.3803556) + .expect_numerically_close(tune.res.2$global.error$BER[1,1], 0.3803556) + .expect_numerically_close(tune.res.3$global.error$BER[1,1], 0.3803556) + +}) \ No newline at end of file From 7f255dbe0e770f3296cc7599ec28a76e987dcdba Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 17:06:30 +1100 Subject: [PATCH 64/75] shortern example lines --- examples/perf.assess-examples.R | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/perf.assess-examples.R b/examples/perf.assess-examples.R index db325574..30a33fd2 100644 --- a/examples/perf.assess-examples.R +++ b/examples/perf.assess-examples.R @@ -7,7 +7,8 @@ length(liver.toxicity$treatment$Treatment.Group) # 64 samples plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2) performance <- perf.assess(plsda.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) # for reproducibility, remove for analysis performance$error.rate$BER @@ -18,7 +19,7 @@ splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Gro keepX = c(25, 25), ncomp = 2) performance <- perf.assess(splsda.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) performance$error.rate$BER # can see slight improvement in error rate over PLS-DA example @@ -30,20 +31,22 @@ ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2) performance <- perf.assess(pls.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) -performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary ## sPLS example spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50)) performance <- perf.assess(spls.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) -performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary ## block PLS-DA example From 64af0ef9428e12d2d6b5ca03fdcd30d9588c2de8 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Wed, 27 Nov 2024 17:25:33 +1100 Subject: [PATCH 65/75] add testing to make sure tune outputs can plot without error --- tests/testthat/test-tune.block.plsda.R | 5 +++++ tests/testthat/test-tune.block.splsda.R | 5 +++++ tests/testthat/test-tune.mint.plsda.R | 6 ++++++ tests/testthat/test-tune.mint.splsda.R | 6 ++++++ tests/testthat/test-tune.pca.R | 3 +++ tests/testthat/test-tune.pls.R | 5 +++++ tests/testthat/test-tune.plsda.R | 5 +++++ tests/testthat/test-tune.rcc.R | 14 ++++++++++++++ tests/testthat/test-tune.spca.R | 6 ++++++ tests/testthat/test-tune.spls.R | 6 ++++++ tests/testthat/test-tune.splsda.R | 6 ++++++ 11 files changed, 67 insertions(+) diff --git a/tests/testthat/test-tune.block.plsda.R b/tests/testthat/test-tune.block.plsda.R index 5bbe8210..21ebdf7f 100644 --- a/tests/testthat/test-tune.block.plsda.R +++ b/tests/testthat/test-tune.block.plsda.R @@ -41,6 +41,11 @@ test_that("tune.block.plsda works and is the same as perf alone and in tune wrap .expect_numerically_close(tune.res.2$error.rate.per.class$mrna$max.dist[3,2], 0.25) .expect_numerically_close(tune.res.3$error.rate.per.class$mrna$max.dist[3,2], 0.25) + # check can plot outputs + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + }) \ No newline at end of file diff --git a/tests/testthat/test-tune.block.splsda.R b/tests/testthat/test-tune.block.splsda.R index b1da06e2..6c24f436 100644 --- a/tests/testthat/test-tune.block.splsda.R +++ b/tests/testthat/test-tune.block.splsda.R @@ -157,6 +157,11 @@ test_that("tune.block.splsda works independently and in tune wrapper the same", BPPARAM = SerialParam(), seed = 42 ) expect_equal(tune11$choice.keepX, tune41$choice.keepX) + + # check can plot outputs + expect_silent(plot(tune41)) + expect_silent(plot(tune11)) + }) test_that("tune.block.splsda works when test.keepX = NULL and gives same result as perf()", code = { diff --git a/tests/testthat/test-tune.mint.plsda.R b/tests/testthat/test-tune.mint.plsda.R index 30c515ac..34ee9b42 100644 --- a/tests/testthat/test-tune.mint.plsda.R +++ b/tests/testthat/test-tune.mint.plsda.R @@ -35,4 +35,10 @@ test_that("tune.mint.plsda works and is the same perf alone and in tune wrapper" .expect_numerically_close(tune.res.2$global.error$BER[1,1], 0.3803556) .expect_numerically_close(tune.res.3$global.error$BER[1,1], 0.3803556) + # check can plot outputs + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + + }) \ No newline at end of file diff --git a/tests/testthat/test-tune.mint.splsda.R b/tests/testthat/test-tune.mint.splsda.R index 68b826b2..58630e7e 100644 --- a/tests/testthat/test-tune.mint.splsda.R +++ b/tests/testthat/test-tune.mint.splsda.R @@ -20,6 +20,9 @@ test_that("tune.mint.splsda works", code = { expect_is(out, "tune.mint.splsda") expect_equal(out$choice.ncomp$ncomp, 1) + # check can plot + expect_silent(plot(out)) + }) test_that("tune.mint.splsda works with custom alpha", code = { @@ -43,6 +46,9 @@ test_that("tune.mint.splsda works with custom alpha", code = { expect_is(out, "tune.mint.splsda") expect_equal(out$choice.ncomp$ncomp, 1) + # check can plot + expect_silent(plot(out)) + }) test_that("tune.mint.splsda works when test.keepX = NULL and gives same result as perf()", code = { diff --git a/tests/testthat/test-tune.pca.R b/tests/testthat/test-tune.pca.R index c1bcfa8f..98b61e45 100644 --- a/tests/testthat/test-tune.pca.R +++ b/tests/testthat/test-tune.pca.R @@ -8,4 +8,7 @@ test_that("tune.pca and tune(method='pca') are equivalent", { # expect results the same expect_equal(object1$ncomp, object2$ncomp) expect_equal(object1$sdev[1], object2$sdev[1]) + # check can plot outputs without errors + expect_silent(plot(object1)) + expect_silent(plot(object2)) }) diff --git a/tests/testthat/test-tune.pls.R b/tests/testthat/test-tune.pls.R index d596e0e0..0bdb1012 100644 --- a/tests/testthat/test-tune.pls.R +++ b/tests/testthat/test-tune.pls.R @@ -30,5 +30,10 @@ test_that("tune.pls works and is the same as perf alone and in tune wrapper", co .expect_numerically_close(tune.res.2$measures$Q2$summary[1,3], -0.1304012) .expect_numerically_close(tune.res.3$measures$Q2$summary[1,3], -0.1304012) + # check can plot outputs + expect_silent(plot(tune.res.1)) + expect_silent(plot(tune.res.2)) + expect_silent(plot(tune.res.3)) + }) \ No newline at end of file diff --git a/tests/testthat/test-tune.plsda.R b/tests/testthat/test-tune.plsda.R index f66c1ba5..d84c1e89 100644 --- a/tests/testthat/test-tune.plsda.R +++ b/tests/testthat/test-tune.plsda.R @@ -42,4 +42,9 @@ test_that("tune.plsda works and is the same perf alone and in tune wrapper", cod .expect_numerically_close(tune.plsda.res.2$error.rate$overall[1,1], 0.5106383) .expect_numerically_close(tune.plsda.res.3$error.rate$overall[1,1], 0.5106383) + # check can plot + expect_silent(plot(tune.plsda.res.1)) + expect_silent(plot(tune.plsda.res.2)) + expect_silent(plot(tune.plsda.res.3)) + }) \ No newline at end of file diff --git a/tests/testthat/test-tune.rcc.R b/tests/testthat/test-tune.rcc.R index 26851c14..7c241f45 100644 --- a/tests/testthat/test-tune.rcc.R +++ b/tests/testthat/test-tune.rcc.R @@ -14,6 +14,9 @@ test_that("tune.rcc works with Mfold method", code = { expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda1, 0.5005) expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) + + # check can plot + expect_silent(plot(tune.rcc.res)) }) test_that("tune.rcc works with loo method", code = { @@ -29,6 +32,9 @@ test_that("tune.rcc works with loo method", code = { expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda1, 0.25075) expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) + + # check can plot + expect_silent(plot(tune.rcc.res)) }) test_that("tune.rcc works in parallel same as in series", code = { @@ -48,6 +54,10 @@ test_that("tune.rcc works in parallel same as in series", code = { expect_equal(class(tune.rcc.res), "tune.rcc") expect_equal(tune.rcc.res$opt.lambda2, tune.rcc.res.parallel$opt.lambda2) expect_equal(tune.rcc.res$grid2, tune.rcc.res.parallel$grid2) + + # check can plot + expect_silent(plot(tune.rcc.res)) + expect_silent(plot(tune.rcc.res.parallel)) }) test_that("tune.rcc and tune(method='rcc') are equivalent", { @@ -75,5 +85,9 @@ test_that("tune.rcc and tune(method='rcc') are equivalent", { expect_equal(tune.rcc.res.1$opt.lambda2, tune.rcc.res.2$opt.lambda2) expect_equal(tune.rcc.res.1$grid2, tune.rcc.res.2$grid2) + # check can plot + expect_silent(plot(tune.rcc.res.1)) + expect_silent(plot(tune.rcc.res.2)) + }) diff --git a/tests/testthat/test-tune.spca.R b/tests/testthat/test-tune.spca.R index dc8559df..61e415db 100644 --- a/tests/testthat/test-tune.spca.R +++ b/tests/testthat/test-tune.spca.R @@ -19,6 +19,9 @@ test_that("tune.spca works in serial and parallel", { expect_equal(object_parallel$choice.keepX[[1]], 35) expect_equal(object_parallel$choice.keepX[[2]], 5) .expect_numerically_close(object_parallel$cor.comp$comp1[1,2], 0.3994544) + # test can plot outputs + expect_silent(plot(object_serial)) + expect_silent(plot(object_parallel)) }) test_that("tune.spca same result in serial and parallel", { @@ -77,5 +80,8 @@ test_that("tune.spca and tune(method='spca') are equivalent", { expect_equal(object1$choice.keepX[[1]], object2$choice.keepX[[1]]) expect_equal(object1$choice.keepX[[2]], object2$choice.keepX[[2]]) .expect_numerically_close(object1$cor.comp$comp1[3,3], object2$cor.comp$comp1[3,3]) + # test can plot outputs + expect_silent(plot(object1)) + expect_silent(plot(object2)) }) \ No newline at end of file diff --git a/tests/testthat/test-tune.spls.R b/tests/testthat/test-tune.spls.R index 7b923237..ddd860c8 100644 --- a/tests/testthat/test-tune.spls.R +++ b/tests/testthat/test-tune.spls.R @@ -64,6 +64,12 @@ test_that("tune.spls works and is the same in parallel and when run in tune wrap .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.2$measure.pred$mean) .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.3$measure.pred$mean) .expect_numerically_close(tune.spls.res.1$measure.pred$mean, tune.spls.res.4$measure.pred$mean) + + # check can plot outputs + expect_silent(plot(tune.spls.res.1)) + expect_silent(plot(tune.spls.res.2)) + expect_silent(plot(tune.spls.res.3)) + expect_silent(plot(tune.spls.res.4)) }) ## If ncol(Y) == 1 tune.spls calls tune.spls1 diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 7528a032..4abe2f18 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -45,6 +45,12 @@ test_that("tune.splsda works and is the same in parallel and when run in tune wr .expect_numerically_close(tune.splsda.res.3$error.rate[1,1], 0.3111111) .expect_numerically_close(tune.splsda.res.4$error.rate[1,1], 0.3111111) + # check can plot outputs + expect_silent(plot(tune.splsda.res.1)) + expect_silent(plot(tune.splsda.res.2)) + expect_silent(plot(tune.splsda.res.3)) + expect_silent(plot(tune.splsda.res.4)) + }) test_that("tune.splsda works when test.keepX = NULL and gives same result as perf()", code = { From 4f9d3592a5311ed4677dab7c7effbbf40a72e083 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:44:14 +1100 Subject: [PATCH 66/75] fix unit tests checking plotting properly --- man/perf.assess.Rd | 15 +++++++++------ tests/testthat/test-tune.block.plsda.R | 2 ++ tests/testthat/test-tune.block.splsda.R | 2 ++ tests/testthat/test-tune.mint.plsda.R | 2 ++ tests/testthat/test-tune.mint.splsda.R | 8 ++++++-- tests/testthat/test-tune.pca.R | 2 ++ tests/testthat/test-tune.pls.R | 2 ++ tests/testthat/test-tune.plsda.R | 2 ++ tests/testthat/test-tune.rcc.R | 8 ++++++++ tests/testthat/test-tune.spca.R | 4 ++++ tests/testthat/test-tune.splsda.R | 12 +++++++----- 11 files changed, 46 insertions(+), 13 deletions(-) diff --git a/man/perf.assess.Rd b/man/perf.assess.Rd index 914b96fa..84a7de9e 100644 --- a/man/perf.assess.Rd +++ b/man/perf.assess.Rd @@ -302,7 +302,8 @@ length(liver.toxicity$treatment$Treatment.Group) # 64 samples plsda.res <- plsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Group, ncomp = 2) performance <- perf.assess(plsda.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) # for reproducibility, remove for analysis performance$error.rate$BER @@ -313,7 +314,7 @@ splsda.res <- splsda(liver.toxicity$gene, liver.toxicity$treatment$Treatment.Gro keepX = c(25, 25), ncomp = 2) performance <- perf.assess(splsda.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) performance$error.rate$BER # can see slight improvement in error rate over PLS-DA example @@ -325,20 +326,22 @@ ncol(liver.toxicity$clinic) # 10 Y variables as output of PLS model pls.res <- pls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2) performance <- perf.assess(pls.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) -performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary ## sPLS example spls.res <- spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2, keepX = c(50, 50)) performance <- perf.assess(spls.res, - validation = "Mfold", folds = 3, nrepeat = 10, # to make sure each fold has all classes represented + validation = "Mfold", folds = 3, nrepeat = 10, seed = 12) -performance$measures$Q2$summary # see Q2 which gives indication of predictive ability for each of the 10 Y outputs +# see Q2 which gives indication of predictive ability for each of the 10 Y outputs +performance$measures$Q2$summary ## block PLS-DA example diff --git a/tests/testthat/test-tune.block.plsda.R b/tests/testthat/test-tune.block.plsda.R index 21ebdf7f..9ede1dd0 100644 --- a/tests/testthat/test-tune.block.plsda.R +++ b/tests/testthat/test-tune.block.plsda.R @@ -42,6 +42,8 @@ test_that("tune.block.plsda works and is the same as perf alone and in tune wrap .expect_numerically_close(tune.res.3$error.rate.per.class$mrna$max.dist[3,2], 0.25) # check can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.res.1)) expect_silent(plot(tune.res.2)) expect_silent(plot(tune.res.3)) diff --git a/tests/testthat/test-tune.block.splsda.R b/tests/testthat/test-tune.block.splsda.R index 6c24f436..a5caf1f5 100644 --- a/tests/testthat/test-tune.block.splsda.R +++ b/tests/testthat/test-tune.block.splsda.R @@ -159,6 +159,8 @@ test_that("tune.block.splsda works independently and in tune wrapper the same", expect_equal(tune11$choice.keepX, tune41$choice.keepX) # check can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune41)) expect_silent(plot(tune11)) diff --git a/tests/testthat/test-tune.mint.plsda.R b/tests/testthat/test-tune.mint.plsda.R index 34ee9b42..d65b575b 100644 --- a/tests/testthat/test-tune.mint.plsda.R +++ b/tests/testthat/test-tune.mint.plsda.R @@ -36,6 +36,8 @@ test_that("tune.mint.plsda works and is the same perf alone and in tune wrapper" .expect_numerically_close(tune.res.3$global.error$BER[1,1], 0.3803556) # check can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.res.1)) expect_silent(plot(tune.res.2)) expect_silent(plot(tune.res.3)) diff --git a/tests/testthat/test-tune.mint.splsda.R b/tests/testthat/test-tune.mint.splsda.R index 58630e7e..8719f6bf 100644 --- a/tests/testthat/test-tune.mint.splsda.R +++ b/tests/testthat/test-tune.mint.splsda.R @@ -21,7 +21,9 @@ test_that("tune.mint.splsda works", code = { expect_equal(out$choice.ncomp$ncomp, 1) # check can plot - expect_silent(plot(out)) + pdf(NULL) + on.exit(dev.off()) + expect_error(plot(out), NA) # makes note about not being able to plot SD bars }) @@ -47,7 +49,9 @@ test_that("tune.mint.splsda works with custom alpha", code = { expect_equal(out$choice.ncomp$ncomp, 1) # check can plot - expect_silent(plot(out)) + pdf(NULL) + on.exit(dev.off()) + expect_error(plot(out), NA) # makes note about not being able to plot SD bars }) diff --git a/tests/testthat/test-tune.pca.R b/tests/testthat/test-tune.pca.R index 98b61e45..77e9afac 100644 --- a/tests/testthat/test-tune.pca.R +++ b/tests/testthat/test-tune.pca.R @@ -9,6 +9,8 @@ test_that("tune.pca and tune(method='pca') are equivalent", { expect_equal(object1$ncomp, object2$ncomp) expect_equal(object1$sdev[1], object2$sdev[1]) # check can plot outputs without errors + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(object1)) expect_silent(plot(object2)) }) diff --git a/tests/testthat/test-tune.pls.R b/tests/testthat/test-tune.pls.R index 0bdb1012..9b0fba35 100644 --- a/tests/testthat/test-tune.pls.R +++ b/tests/testthat/test-tune.pls.R @@ -31,6 +31,8 @@ test_that("tune.pls works and is the same as perf alone and in tune wrapper", co .expect_numerically_close(tune.res.3$measures$Q2$summary[1,3], -0.1304012) # check can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.res.1)) expect_silent(plot(tune.res.2)) expect_silent(plot(tune.res.3)) diff --git a/tests/testthat/test-tune.plsda.R b/tests/testthat/test-tune.plsda.R index d84c1e89..9ee1a112 100644 --- a/tests/testthat/test-tune.plsda.R +++ b/tests/testthat/test-tune.plsda.R @@ -43,6 +43,8 @@ test_that("tune.plsda works and is the same perf alone and in tune wrapper", cod .expect_numerically_close(tune.plsda.res.3$error.rate$overall[1,1], 0.5106383) # check can plot + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.plsda.res.1)) expect_silent(plot(tune.plsda.res.2)) expect_silent(plot(tune.plsda.res.3)) diff --git a/tests/testthat/test-tune.rcc.R b/tests/testthat/test-tune.rcc.R index 7c241f45..7cf83a5c 100644 --- a/tests/testthat/test-tune.rcc.R +++ b/tests/testthat/test-tune.rcc.R @@ -16,6 +16,8 @@ test_that("tune.rcc works with Mfold method", code = { expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) # check can plot + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.rcc.res)) }) @@ -34,6 +36,8 @@ test_that("tune.rcc works with loo method", code = { expect_equal(tune.rcc.res$grid1, c(0.00100, 0.25075, 0.50050, 0.75025, 1.00000)) # check can plot + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.rcc.res)) }) @@ -56,6 +60,8 @@ test_that("tune.rcc works in parallel same as in series", code = { expect_equal(tune.rcc.res$grid2, tune.rcc.res.parallel$grid2) # check can plot + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.rcc.res)) expect_silent(plot(tune.rcc.res.parallel)) }) @@ -86,6 +92,8 @@ test_that("tune.rcc and tune(method='rcc') are equivalent", { expect_equal(tune.rcc.res.1$grid2, tune.rcc.res.2$grid2) # check can plot + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(tune.rcc.res.1)) expect_silent(plot(tune.rcc.res.2)) diff --git a/tests/testthat/test-tune.spca.R b/tests/testthat/test-tune.spca.R index 61e415db..d4fd6c75 100644 --- a/tests/testthat/test-tune.spca.R +++ b/tests/testthat/test-tune.spca.R @@ -20,6 +20,8 @@ test_that("tune.spca works in serial and parallel", { expect_equal(object_parallel$choice.keepX[[2]], 5) .expect_numerically_close(object_parallel$cor.comp$comp1[1,2], 0.3994544) # test can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(object_serial)) expect_silent(plot(object_parallel)) }) @@ -81,6 +83,8 @@ test_that("tune.spca and tune(method='spca') are equivalent", { expect_equal(object1$choice.keepX[[2]], object2$choice.keepX[[2]]) .expect_numerically_close(object1$cor.comp$comp1[3,3], object2$cor.comp$comp1[3,3]) # test can plot outputs + pdf(NULL) + on.exit(dev.off()) expect_silent(plot(object1)) expect_silent(plot(object2)) }) diff --git a/tests/testthat/test-tune.splsda.R b/tests/testthat/test-tune.splsda.R index 4abe2f18..3116f99f 100644 --- a/tests/testthat/test-tune.splsda.R +++ b/tests/testthat/test-tune.splsda.R @@ -31,11 +31,13 @@ test_that("tune.splsda works and is the same in parallel and when run in tune wr method = "splsda") - # check outputs + # check outputs format expect_equal(class(tune.splsda.res.1), "tune.splsda") expect_equal(class(tune.splsda.res.2), "tune.splsda") expect_equal(class(tune.splsda.res.3), "tune.splsda") expect_equal(class(tune.splsda.res.4), "tune.splsda") + + # check output values expect_equal(unname(tune.splsda.res.1$choice.keepX), c(10,15)) expect_equal(unname(tune.splsda.res.2$choice.keepX), c(10,15)) expect_equal(unname(tune.splsda.res.3$choice.keepX), c(10,15)) @@ -46,10 +48,10 @@ test_that("tune.splsda works and is the same in parallel and when run in tune wr .expect_numerically_close(tune.splsda.res.4$error.rate[1,1], 0.3111111) # check can plot outputs - expect_silent(plot(tune.splsda.res.1)) - expect_silent(plot(tune.splsda.res.2)) - expect_silent(plot(tune.splsda.res.3)) - expect_silent(plot(tune.splsda.res.4)) + expect_is(plot(tune.splsda.res.1), "ggplot") + expect_is(plot(tune.splsda.res.2), "ggplot") + expect_is(plot(tune.splsda.res.3), "ggplot") + expect_is(plot(tune.splsda.res.4), "ggplot") }) From 7fc80d0f5fe432b7874742f6c16d3814630bed08 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:03:59 +1100 Subject: [PATCH 67/75] add example for tune.pca multilevel --- R/tune.pca.R | 7 ++----- examples/tune.pca-examples.R | 17 +++++++++++++++++ man/tune.pca.Rd | 15 ++++++++++++++- 3 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 examples/tune.pca-examples.R diff --git a/R/tune.pca.R b/R/tune.pca.R index e1a62e2e..abc2f985 100644 --- a/R/tune.pca.R +++ b/R/tune.pca.R @@ -52,11 +52,8 @@ #' for more details. #' @keywords algebra #' @export -#' @examples -#' data(liver.toxicity) -#' tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE) -#' tune -#' plot(tune) +#' @example ./examples/tune.pca-examples.R + tune.pca <- function(X, ncomp = NULL, diff --git a/examples/tune.pca-examples.R b/examples/tune.pca-examples.R new file mode 100644 index 00000000..5f9ec17a --- /dev/null +++ b/examples/tune.pca-examples.R @@ -0,0 +1,17 @@ +# load data +data(liver.toxicity) + +# run tuning +tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE) +plot(tune) + +# set up multilevel dataset +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# run tuning +tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE, multilevel = design) +plot(tune) \ No newline at end of file diff --git a/man/tune.pca.Rd b/man/tune.pca.Rd index 1135e1f3..dc808fbe 100644 --- a/man/tune.pca.Rd +++ b/man/tune.pca.Rd @@ -87,9 +87,22 @@ internal pre-processing step, through \code{\link{logratio.transfo}} and \code{\link{withinVariation}} respectively. } \examples{ +# load data data(liver.toxicity) + +# run tuning tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE) -tune +plot(tune) + +# set up multilevel dataset +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# run tuning +tune <- tune.pca(liver.toxicity$gene, center = TRUE, scale = TRUE, multilevel = design) plot(tune) } \seealso{ From 36b1ca82eb223a7f36962c4e7adc87b58b3b2616 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:18:03 +1100 Subject: [PATCH 68/75] add multilevel examples to tune.pls and tune.spls --- examples/tune.pls-examples.R | 26 ++++++++++++++++------ examples/tune.spls-examples.R | 42 +++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/examples/tune.pls-examples.R b/examples/tune.pls-examples.R index 3afb183d..cb945cc0 100644 --- a/examples/tune.pls-examples.R +++ b/examples/tune.pls-examples.R @@ -3,15 +3,27 @@ data(liver.toxicity) X <- liver.toxicity$gene Y <- liver.toxicity$clinic -# tune pls model to find optimal number of components -tune.res <- tune.pls( X, Y, ncomp = 10, measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) +# tune PLS2 model to find optimal number of components +tune.res <- tune.pls(X, Y, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) # plot outputs -# PLS1 model +# PLS1 model example Y1 <- liver.toxicity$clinic[,1] -tune.res <- tune.pls( X, Y1, ncomp = 10, measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) +tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) -plot(tune.res) \ No newline at end of file +plot(tune.res) + +# Multilevel PLS2 model +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) + +plot(tune) \ No newline at end of file diff --git a/examples/tune.spls-examples.R b/examples/tune.spls-examples.R index 781c2469..07f8d948 100644 --- a/examples/tune.spls-examples.R +++ b/examples/tune.spls-examples.R @@ -1,4 +1,4 @@ -## sPLS2 model (i.e. more than one Y outcome variable) +## sPLS2 model example (more than one Y outcome variable) # set up data data(liver.toxicity) @@ -14,22 +14,46 @@ plot(tune.res.ncomp) # plot outputs # tune spls model for number of X and Y variables to keep tune.res <- tune.spls( X, Y, ncomp = 3, - test.keepX = c(5, 10, 15), - test.keepY = c(3, 6, 8), measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) # plot outputs -## sPLS1 model (i.e. only one Y outcome variable) +## sPLS1 model example (only one Y outcome variable) # set up data Y1 <- liver.toxicity$clinic[,1] # tune spls model for components only plot(tune.spls(X, Y1, ncomp = 3, -folds = 3, -test.keepX = NULL, test.keepY = NULL)) + folds = 3, + test.keepX = NULL, test.keepY = NULL)) # tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set plot(tune.spls(X, Y1, ncomp = 3, -folds = 3, measure = "MSE", -test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) \ No newline at end of file + folds = 3, measure = "MSE", + test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) + + +## sPLS2 multilevel model example + +# set up multilevel design +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs From da5fec570433e2b888f04997c4bbccb6d0380bb9 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:18:08 +1100 Subject: [PATCH 69/75] doc --- man/tune.pls.Rd | 24 ++++++++++++++++++------ man/tune.spls.Rd | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index 4401e4ef..18b20e75 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -179,18 +179,30 @@ data(liver.toxicity) X <- liver.toxicity$gene Y <- liver.toxicity$clinic -# tune pls model to find optimal number of components -tune.res <- tune.pls( X, Y, ncomp = 10, measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) +# tune PLS2 model to find optimal number of components +tune.res <- tune.pls(X, Y, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) # plot outputs -# PLS1 model +# PLS1 model example Y1 <- liver.toxicity$clinic[,1] -tune.res <- tune.pls( X, Y1, ncomp = 10, measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) +tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) + +# Multilevel PLS2 model +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) + +plot(tune) } \references{ mixOmics article: diff --git a/man/tune.spls.Rd b/man/tune.spls.Rd index f18b866f..ba6f0c0f 100644 --- a/man/tune.spls.Rd +++ b/man/tune.spls.Rd @@ -195,7 +195,7 @@ See also \code{?perf} for more details. } \examples{ -## sPLS2 model (i.e. more than one Y outcome variable) +## sPLS2 model example (more than one Y outcome variable) # set up data data(liver.toxicity) @@ -211,25 +211,49 @@ plot(tune.res.ncomp) # plot outputs # tune spls model for number of X and Y variables to keep tune.res <- tune.spls( X, Y, ncomp = 3, - test.keepX = c(5, 10, 15), - test.keepY = c(3, 6, 8), measure = "cor", - folds = 5, nrepeat = 3, progressBar = TRUE) + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", + folds = 5, nrepeat = 3, progressBar = TRUE) plot(tune.res) # plot outputs -## sPLS1 model (i.e. only one Y outcome variable) +## sPLS1 model example (only one Y outcome variable) # set up data Y1 <- liver.toxicity$clinic[,1] # tune spls model for components only plot(tune.spls(X, Y1, ncomp = 3, -folds = 3, -test.keepX = NULL, test.keepY = NULL)) + folds = 3, + test.keepX = NULL, test.keepY = NULL)) # tune spls model for number of X variables to keep, note for sPLS1 models 'measure' needs to be set plot(tune.spls(X, Y1, ncomp = 3, -folds = 3, measure = "MSE", -test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) + folds = 3, measure = "MSE", + test.keepX = c(5, 10, 15), test.keepY = c(3, 6, 8))) + + +## sPLS2 multilevel model example + +# set up multilevel design +repeat.indiv <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5, 6, 5, 5, + 6, 5, 6, 7, 7, 8, 6, 7, 8, 7, 8, 8, 9, 10, 9, 10, 11, 9, 9, + 10, 11, 12, 12, 10, 11, 12, 11, 12, 13, 14, 13, 14, 13, 14, + 13, 14, 15, 16, 15, 16, 15, 16, 15, 16) +design <- data.frame(sample = repeat.indiv) + +# tune spls model for components only +tune.res.ncomp <- tune.spls( X, Y, ncomp = 5, + test.keepX = NULL, + test.keepY = NULL, measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res.ncomp) # plot outputs + +# tune spls model for number of X and Y variables to keep +tune.res <- tune.spls( X, Y, ncomp = 3, + test.keepX = c(5, 10, 15), + test.keepY = c(3, 6, 8), measure = "cor", multilevel = design, + folds = 5, nrepeat = 3, progressBar = TRUE) +plot(tune.res) # plot outputs } \references{ mixOmics article: From 9bd6753db38f4437d25fb79c4f5e6bfaff4e3355 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:22:03 +1100 Subject: [PATCH 70/75] update tune.plsda to work on multilevel and add example --- R/tune.plsda.R | 3 --- examples/tune.plsda-examples.R | 23 ++++++++++++++++------- man/tune.plsda.Rd | 21 +++++++++++++++------ 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/R/tune.plsda.R b/R/tune.plsda.R index 34e74cc7..25504f88 100644 --- a/R/tune.plsda.R +++ b/R/tune.plsda.R @@ -194,9 +194,6 @@ tune.plsda <- if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0,1,2))# multilevel 1 or 2 factors stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-columns data.frame containing 2 factors.") - multilevel = data.frame(multilevel, Y) - multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements - } diff --git a/examples/tune.plsda-examples.R b/examples/tune.plsda-examples.R index 91cb039f..b9a64cdb 100644 --- a/examples/tune.plsda-examples.R +++ b/examples/tune.plsda-examples.R @@ -1,12 +1,21 @@ ## Example: analysis with PLS-DA data(breast.tumors) -X = breast.tumors$gene.exp -Y = as.factor(breast.tumors$sample$treatment) # tune components and distance -tune = tune.plsda(X, Y, ncomp = 5, logratio = "none", - nrepeat = 10, folds = 10, - progressBar = TRUE, - seed = 20) # set for reproducibility of example only +tune = tune.plsda(breast.tumors$gene.exp, as.factor(breast.tumors$sample$treatment), + ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only plot(tune) # optimal distance = centroids.dist -tune$choice.ncomp # optimal component number = 3 \ No newline at end of file +tune$choice.ncomp # optimal component number = 3 + +## Example: multilevel PLS-DA +data(vac18) +design <- data.frame(sample = vac18$sample) # set the multilevel design + +tune1 <- tune.plsda(vac18$genes, vac18$stimulation, + ncomp = 5, multilevel = design, + nrepeat = 10, folds = 10, + seed = 20) +plot(tune1) \ No newline at end of file diff --git a/man/tune.plsda.Rd b/man/tune.plsda.Rd index 63dff237..e6c7cf2b 100644 --- a/man/tune.plsda.Rd +++ b/man/tune.plsda.Rd @@ -155,16 +155,25 @@ for more details see the perf() help pages. \examples{ ## Example: analysis with PLS-DA data(breast.tumors) -X = breast.tumors$gene.exp -Y = as.factor(breast.tumors$sample$treatment) # tune components and distance -tune = tune.plsda(X, Y, ncomp = 5, logratio = "none", - nrepeat = 10, folds = 10, - progressBar = TRUE, - seed = 20) # set for reproducibility of example only +tune = tune.plsda(breast.tumors$gene.exp, as.factor(breast.tumors$sample$treatment), + ncomp = 5, logratio = "none", + nrepeat = 10, folds = 10, + progressBar = TRUE, + seed = 20) # set for reproducibility of example only plot(tune) # optimal distance = centroids.dist tune$choice.ncomp # optimal component number = 3 + +## Example: multilevel PLS-DA +data(vac18) +design <- data.frame(sample = vac18$sample) # set the multilevel design + +tune1 <- tune.plsda(vac18$genes, vac18$stimulation, + ncomp = 5, multilevel = design, + nrepeat = 10, folds = 10, + seed = 20) +plot(tune1) } \references{ mixOmics article: From c9961b38dc93f7eb079e4480042051a937421791 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:28:28 +1100 Subject: [PATCH 71/75] tune.splsda take multilevel and add exampel --- R/tune.splsda.R | 7 ++++--- examples/tune.splsda-examples.R | 13 ++++++++++--- man/tune.splsda.Rd | 11 +++++++++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/R/tune.splsda.R b/R/tune.splsda.R index bc6ce44a..3f38e879 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -235,9 +235,6 @@ tune.splsda <- if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0,1,2))# multilevel 1 or 2 factors stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-columns data.frame containing 2 factors.") - multilevel = data.frame(multilevel, Y) - multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements - } @@ -309,6 +306,10 @@ tune.splsda <- return(perf_res) #-> if test.keepX is not NULL, run tune function as before } else { + + # reformat multilevel design for tuning variables + multilevel = data.frame(multilevel, Y) + multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements #-- test.keepX if (length(test.keepX) == 1 | !is.numeric(test.keepX)) diff --git a/examples/tune.splsda-examples.R b/examples/tune.splsda-examples.R index 801e3681..36e2d585 100644 --- a/examples/tune.splsda-examples.R +++ b/examples/tune.splsda-examples.R @@ -7,6 +7,7 @@ Y = as.factor(breast.tumors$sample$treatment) tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", nrepeat = 10, folds = 10, test.keepX = NULL, + dist = "all", progressBar = TRUE, seed = 20) # set for reproducibility of example only plot(tune) # optimal distance = centroids.dist @@ -22,7 +23,7 @@ tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", plot(tune) tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) -\dontrun{ +## With already tested variables: tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", nrepeat = 10, folds = 10, test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), @@ -39,8 +40,14 @@ Y = vac18$stimulation # sample indicates the repeated measurements design = data.frame(sample = vac18$sample) +# tune on components +tune = tune.splsda(X, Y = Y, ncomp = 5, nrepeat = 10, logratio = "none", + test.keepX = NULL, folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) + +# tune on variables tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) -plot(tune) -} \ No newline at end of file +plot(tune) \ No newline at end of file diff --git a/man/tune.splsda.Rd b/man/tune.splsda.Rd index 770da82f..cfe20a24 100644 --- a/man/tune.splsda.Rd +++ b/man/tune.splsda.Rd @@ -209,6 +209,7 @@ Y = as.factor(breast.tumors$sample$treatment) tune = tune.splsda(X, Y, ncomp = 5, logratio = "none", nrepeat = 10, folds = 10, test.keepX = NULL, + dist = "all", progressBar = TRUE, seed = 20) # set for reproducibility of example only plot(tune) # optimal distance = centroids.dist @@ -224,7 +225,7 @@ tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", plot(tune) tune$choice.keepX # optimal number of variables to keep c(15, 5, 15) -\dontrun{ +## With already tested variables: tune = tune.splsda(X, Y, ncomp = 3, logratio = "none", nrepeat = 10, folds = 10, test.keepX = c(5, 10, 15), already.tested.X = c(5, 10), @@ -241,12 +242,18 @@ Y = vac18$stimulation # sample indicates the repeated measurements design = data.frame(sample = vac18$sample) +# tune on components +tune = tune.splsda(X, Y = Y, ncomp = 5, nrepeat = 10, logratio = "none", + test.keepX = NULL, folds = 10, dist = "max.dist", multilevel = design) + +plot(tune) + +# tune on variables tune = tune.splsda(X, Y = Y, ncomp = 3, nrepeat = 10, logratio = "none", test.keepX = c(5,50,100),folds = 10, dist = "max.dist", multilevel = design) plot(tune) } -} \references{ mixOmics article: From 49e1b9dd94a435c25ba97c2bcf896b412c8436b5 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:22:00 +1100 Subject: [PATCH 72/75] install mixomics from github inside the vignette --- vignettes/vignette.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index eac384f5..c677c9d4 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -279,7 +279,7 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) Alternatively, you can install the latest GitHub version of the package: -```{r 02-install-github, eval = FALSE} +```{r 02-install-github, eval = TRUE} BiocManager::install("mixOmicsTeam/mixOmics") ``` From 036c422ac3e89518b0bafcc6e839f362574a35a8 Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:10:26 +1100 Subject: [PATCH 73/75] fixed tune.splsda multilevel error --- R/tune.splsda.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/tune.splsda.R b/R/tune.splsda.R index 3f38e879..cbbd143c 100644 --- a/R/tune.splsda.R +++ b/R/tune.splsda.R @@ -308,9 +308,11 @@ tune.splsda <- } else { # reformat multilevel design for tuning variables - multilevel = data.frame(multilevel, Y) - multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements - + if (!is.null(multilevel)){ + multilevel = data.frame(multilevel, Y) + multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements + } + #-- test.keepX if (length(test.keepX) == 1 | !is.numeric(test.keepX)) stop("'test.keepX' must be a numeric vector with more than two entries", call. = FALSE) From ce7f59affdbf3f6ff59abb271af14335049d6adc Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:16:47 +1100 Subject: [PATCH 74/75] revert back vignette change --- vignettes/vignette.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index c677c9d4..eac384f5 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -279,7 +279,7 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) Alternatively, you can install the latest GitHub version of the package: -```{r 02-install-github, eval = TRUE} +```{r 02-install-github, eval = FALSE} BiocManager::install("mixOmicsTeam/mixOmics") ``` From dcda5b5b4b5e57cfb5ac57a726e4c71252236cad Mon Sep 17 00:00:00 2001 From: Eva Hamrud <50098063+evaham1@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:20:19 +1100 Subject: [PATCH 75/75] typo in example --- examples/tune.pls-examples.R | 2 +- man/tune.pls.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tune.pls-examples.R b/examples/tune.pls-examples.R index cb945cc0..bca5c900 100644 --- a/examples/tune.pls-examples.R +++ b/examples/tune.pls-examples.R @@ -26,4 +26,4 @@ design <- data.frame(sample = repeat.indiv) tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", multilevel = design, folds = 5, nrepeat = 3, progressBar = TRUE) -plot(tune) \ No newline at end of file +plot(tune.res) \ No newline at end of file diff --git a/man/tune.pls.Rd b/man/tune.pls.Rd index 18b20e75..866953f2 100644 --- a/man/tune.pls.Rd +++ b/man/tune.pls.Rd @@ -202,7 +202,7 @@ design <- data.frame(sample = repeat.indiv) tune.res <- tune.pls(X, Y1, ncomp = 10, measure = "cor", multilevel = design, folds = 5, nrepeat = 3, progressBar = TRUE) -plot(tune) +plot(tune.res) } \references{ mixOmics article: