From 9a128f4cfb1d1d294fe350a239530d0e40ea702d Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 16 Dec 2024 20:29:14 +0100 Subject: [PATCH 1/6] add remaining arguments to xgboost() --- R-package/R/xgb.train.R | 3 +- R-package/R/xgboost.R | 170 ++++++++++---- R-package/man/xgb.params.Rd | 3 +- R-package/man/xgboost.Rd | 442 ++++++++++++++++++++++++++++++++++-- 4 files changed, 550 insertions(+), 68 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index cd13d82b172e..b8ec844e4974 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -402,7 +402,8 @@ xgb.train <- function(params = xgb.params(), data, nrounds, evals = list(), #' If passing `NULL` for a given parameter (the default for all of them), then the default #' value for that parameter will be used. Default values are automatically determined by the #' XGBoost core library upon calls to [xgb.train()] or [xgb.cv()], and are subject to change -#' over XGBoost library versions. +#' over XGBoost library versions. Some of them might differ according to the +#' booster type (e.g. defaults for regularization are different for linear and tree-based boosters). #' @return A list with the entries that were passed non-NULL values. It is intended to #' be passed as argument `params` to [xgb.train()] or [xgb.cv()]. #' @export diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index b62c25266269..fbf7cc4b01aa 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1,25 +1,3 @@ -prescreen.parameters <- function(params) { - if (!NROW(params)) { - return(list()) - } - if (!is.list(params)) { - stop("'params' must be a list or NULL.") - } - - params <- params[!is.null(params)] - - if ("num_class" %in% names(params)) { - stop("'num_class' cannot be manually specified for 'xgboost()'. Pass a factor 'y' instead.") - } - if ("process_type" %in% names(params)) { - if (params$process_type != "default") { - stop("Non-default 'process_type' is not supported for 'xgboost()'. Try 'xgb.train()'.") - } - } - - return(params) -} - prescreen.objective <- function(objective) { if (!is.null(objective)) { if (!is.character(objective) || length(objective) != 1L || is.na(objective)) { @@ -863,6 +841,10 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' See [xgb.train()] for a more flexible low-level alternative which is similar across different #' language bindings of XGBoost and which exposes the full library's functionalities. #' +#' By default, most of the parameters here have a value of `NULL`, which signals XGBoost to use its +#' default value. Default values are automatically determined by the XGBoost core library, and are +#' subject to change over XGBoost library versions. Some of them might differ according to the +#' booster type (e.g. defaults for regularization are different for linear and tree-based boosters). #' @details #' For package authors using 'xgboost' as a dependency, it is highly recommended to use #' [xgb.train()] in package code instead of [xgboost()], since it has a more stable interface @@ -906,26 +888,45 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be #' set as the last level. #' @param objective Optimization objective to minimize based on the supplied data, to be passed -#' by name as a string / character (e.g. `reg:absoluteerror`). 
See the -#' [Learning Task Parameters](https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters) -#' page and the [xgb.params()] documentation for more detailed information on allowed values. -#' -#' If `NULL` (the default), will be automatically determined from `y` according to the following -#' logic: -#' - If `y` is a factor with 2 levels, will use `binary:logistic`. -#' - If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes -#' will be determined automatically, should not be passed under `params`). -#' - If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that -#' the only types supported are left / right / interval censored). -#' - Otherwise, will use `reg:squarederror`. -#' -#' If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y` -#' can only be used with classification objectives and vice-versa. -#' -#' Note that not all possible `objective` values supported by the core XGBoost library are allowed -#' here - for example, objectives which are a variation of another but with a different default -#' prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are -#' ranking objectives, nor custom objectives at the moment. +#' by name as a string / character (e.g. `reg:absoluteerror`). See the +#' [Learning Task Parameters](https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters) +#' page and the [xgb.params()] documentation for more detailed information on allowed values. +#' +#' If `NULL` (the default), will be automatically determined from `y` according to the following +#' logic: +#' - If `y` is a factor with 2 levels, will use `binary:logistic`. +#' - If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes +#' will be determined automatically, should not be passed under `params`). +#' - If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that +#' the only types supported are left / right / interval censored). +#' - Otherwise, will use `reg:squarederror`. +#' +#' If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y` +#' can only be used with classification objectives and vice-versa. +#' +#' Note that not all possible `objective` values supported by the core XGBoost library are allowed +#' here - for example, objectives which are a variation of another but with a different default +#' prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are +#' ranking objectives, nor custom objectives at the moment. +#' +#' Supported values are: +#' - `"reg:squarederror"`: regression with squared loss. +#' - `"reg:squaredlogerror"`: regression with squared log loss \eqn{\frac{1}{2}[log(pred + 1) - log(label + 1)]^2}. All input labels are required to be greater than -1. Also, see metric `rmsle` for possible issue with this objective. +#' - `"reg:pseudohubererror"`: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss. +#' - `"reg:absoluteerror"`: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal. +#' - `"reg:quantileerror"`: Quantile loss, also known as "pinball loss". 
See later sections for its parameter and [Quantile Regression](https://xgboost.readthedocs.io/en/latest/python/examples/quantile_regression.html#sphx-glr-python-examples-quantile-regression-py) for a worked example.
+#' - `"binary:logistic"`: logistic regression for binary classification, output probability
+#' - `"binary:hinge"`: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
+#' - `"count:poisson"`: Poisson regression for count data, output mean of Poisson distribution.
+#' `"max_delta_step"` is set to 0.7 by default in Poisson regression (used to safeguard optimization)
+#' - `"survival:cox"`: Cox regression for right censored survival time data (negative values are considered right censored).
+#'
+#' Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function `h(t) = h0(t) * HR`).
+#' - `"survival:aft"`: Accelerated failure time model for censored survival time data.
+#' See [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html) for details.
+#' - `"multi:softprob"`: multi-class classification through multinomial logistic likelihood.
+#' - `"reg:gamma"`: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be [gamma-distributed](https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications).
+#' - `"reg:tweedie"`: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be [Tweedie-distributed](https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications).
 #' @param nrounds Number of boosting iterations / rounds.
 #'
 #' Note that the number of default boosting rounds here is not automatically tuned, and different
@@ -1022,6 +1023,32 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) {
 #'
 #' If `NULL`, will start from zero, but note that for most objectives, an intercept is usually
 #' added (controllable through parameter `base_score` instead) when `base_margin` is not passed.
+#' @param min_split_loss (for Tree Booster) (default=0, alias: `gamma`)
+#' Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger `min_split_loss` is, the more conservative the algorithm will be. Note that a tree where no splits were made might still contain a single terminal node with a non-zero score.
+#'
+#' range: \eqn{[0, \infty)}
+#' @param learning_rate (default=0.3, alias: `eta`)
+#' Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and `learning_rate` shrinks the feature weights to make the boosting process more conservative.
+#'
+#' range: \eqn{[0,1]}
+#' @param reg_lambda (alias: `lambda`)
+#' - For tree-based boosters:
+#'   - L2 regularization term on weights. Increasing this value will make model more conservative.
+#'   - default: 1
+#'   - range: \eqn{[0, \infty]}
+#' - For linear booster:
+#'   - L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples.
+#'   - default: 0
+#'   - range: \eqn{[0, \infty)}
+#' @param reg_alpha (alias: `alpha`)
+#' - L1 regularization term on weights. Increasing this value will make model more conservative.
+#' - For the linear booster, it's normalised to number of training examples. +#' - default: 0 +#' - range: \eqn{[0, \infty)} +#' @param updater (for Linear Booster) (default= `"shotgun"`) +#' Choice of algorithm to fit linear model +#' - `"shotgun"`: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run. +#' - `"coord_descent"`: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the `device` parameter is set to `"cuda"` or `"gpu"`, a GPU variant would be used. #' @param ... Other training parameters. See the online documentation #' [XGBoost Parameters](https://xgboost.readthedocs.io/en/stable/parameter.html) for #' details about possible values and what they do. @@ -1029,6 +1056,7 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' Note that not all possible values from the core XGBoost library are allowed as `params` for #' 'xgboost()' - in particular, values which require an already-fitted booster object (such as #' `process_type`) are not accepted here. +#' @inheritParams xgb.params #' @return A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular #' `xgb.Booster` model class produced by [xgb.train()], this `xgboost` class will have an #' @@ -1051,23 +1079,70 @@ xgboost <- function( y, objective = NULL, nrounds = 100L, + max_depth = NULL, + learning_rate = NULL, + min_child_weight = NULL, + min_split_loss = NULL, + reg_lambda = NULL, weights = NULL, verbosity = if (is.null(eval_set)) 0L else 1L, monitor_training = verbosity > 0, eval_set = NULL, early_stopping_rounds = NULL, print_every_n = 1L, + eval_metric = NULL, nthreads = parallel::detectCores(), seed = 0L, + base_margin = NULL, monotone_constraints = NULL, interaction_constraints = NULL, + reg_alpha = NULL, + max_bin = NULL, + max_leaves = NULL, + booster = NULL, + subsample = NULL, + sampling_method = NULL, feature_weights = NULL, - base_margin = NULL, - ... + colsample_bytree = NULL, + colsample_bylevel = NULL, + colsample_bynode = NULL, + tree_method = NULL, + max_delta_step = NULL, + scale_pos_weight = NULL, + updater = NULL, + grow_policy = NULL, + num_parallel_tree = NULL, + multi_strategy = NULL, + base_score = NULL, + seed_per_iteration = NULL, + device = NULL, + disable_default_eval_metric = NULL, + use_rmm = NULL, + max_cached_hist_node = NULL, + extmem_single_page = NULL, + max_cat_to_onehot = NULL, + max_cat_threshold = NULL, + sample_type = NULL, + normalize_type = NULL, + rate_drop = NULL, + one_drop = NULL, + skip_drop = NULL, + feature_selector = NULL, + top_k = NULL, + tweedie_variance_power = NULL, + huber_slope = NULL, + quantile_alpha = NULL, + aft_loss_distribution = NULL ) { - # Note: '...' is a workaround, to be removed later by making all parameters be arguments - params <- list(...) 
- params <- prescreen.parameters(params) + params <- as.list(environment()) + params <- params[ + (names(params) %in% formalArgs(xgb.params)) + & !sapply(params, is.null) + & !(names(params) %in% c( # these undergo additional processing here + "objective", "base_margin", "monotone_constraints", "interaction_constraints" + )) + ] + prescreen.objective(objective) use_qdm <- check.can.use.qdm(x, params, eval_set) lst_args <- process.y.margin.and.objective(y, base_margin, objective, params) @@ -1089,7 +1164,6 @@ xgboost <- function( nthreads <- check.nthreads(nthreads) lst_args$dmatrix_args$nthread <- nthreads lst_args$params$nthread <- nthreads - lst_args$params$seed <- seed params <- c(lst_args$params, params) params$verbosity <- verbosity diff --git a/R-package/man/xgb.params.Rd b/R-package/man/xgb.params.Rd index 051fba6c8bd0..418efc32b72d 100644 --- a/R-package/man/xgb.params.Rd +++ b/R-package/man/xgb.params.Rd @@ -535,5 +535,6 @@ values. Note that this function will not perform any validation on the supplied If passing \code{NULL} for a given parameter (the default for all of them), then the default value for that parameter will be used. Default values are automatically determined by the XGBoost core library upon calls to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}, and are subject to change -over XGBoost library versions. +over XGBoost library versions. Some of them might differ according to the +booster type (e.g. defaults for regularization are different for linear and tree-based boosters). } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 058090e1ad1f..61a1c290479d 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -9,19 +9,60 @@ xgboost( y, objective = NULL, nrounds = 100L, + max_depth = NULL, + learning_rate = NULL, + min_child_weight = NULL, + min_split_loss = NULL, + reg_lambda = NULL, weights = NULL, verbosity = if (is.null(eval_set)) 0L else 1L, monitor_training = verbosity > 0, eval_set = NULL, early_stopping_rounds = NULL, print_every_n = 1L, + eval_metric = NULL, nthreads = parallel::detectCores(), seed = 0L, + base_margin = NULL, monotone_constraints = NULL, interaction_constraints = NULL, + reg_alpha = NULL, + max_bin = NULL, + max_leaves = NULL, + booster = NULL, + subsample = NULL, + sampling_method = NULL, feature_weights = NULL, - base_margin = NULL, - ... + colsample_bytree = NULL, + colsample_bylevel = NULL, + colsample_bynode = NULL, + tree_method = NULL, + max_delta_step = NULL, + scale_pos_weight = NULL, + updater = NULL, + grow_policy = NULL, + num_parallel_tree = NULL, + multi_strategy = NULL, + base_score = NULL, + seed_per_iteration = NULL, + device = NULL, + disable_default_eval_metric = NULL, + use_rmm = NULL, + max_cached_hist_node = NULL, + extmem_single_page = NULL, + max_cat_to_onehot = NULL, + max_cat_threshold = NULL, + sample_type = NULL, + normalize_type = NULL, + rate_drop = NULL, + one_drop = NULL, + skip_drop = NULL, + feature_selector = NULL, + top_k = NULL, + tweedie_variance_power = NULL, + huber_slope = NULL, + quantile_alpha = NULL, + aft_loss_distribution = NULL ) } \arguments{ @@ -88,13 +129,70 @@ can only be used with classification objectives and vice-versa. Note that not all possible \code{objective} values supported by the core XGBoost library are allowed here - for example, objectives which are a variation of another but with a different default prediction type (e.g. \code{multi:softmax} vs. 
\code{multi:softprob}) are not allowed, and neither are
-ranking objectives, nor custom objectives at the moment.}
+ranking objectives, nor custom objectives at the moment.
+
+Supported values are:
+\itemize{
+\item \code{"reg:squarederror"}: regression with squared loss.
+\item \code{"reg:squaredlogerror"}: regression with squared log loss \eqn{\frac{1}{2}[log(pred + 1) - log(label + 1)]^2}. All input labels are required to be greater than -1. Also, see metric \code{rmsle} for possible issue with this objective.
+\item \code{"reg:pseudohubererror"}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
+\item \code{"reg:absoluteerror"}: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
+\item \code{"reg:quantileerror"}: Quantile loss, also known as "pinball loss". See later sections for its parameter and \href{https://xgboost.readthedocs.io/en/latest/python/examples/quantile_regression.html#sphx-glr-python-examples-quantile-regression-py}{Quantile Regression} for a worked example.
+\item \code{"binary:logistic"}: logistic regression for binary classification, output probability
+\item \code{"binary:hinge"}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
+\item \code{"count:poisson"}: Poisson regression for count data, output mean of Poisson distribution.
+\code{"max_delta_step"} is set to 0.7 by default in Poisson regression (used to safeguard optimization)
+\item \code{"survival:cox"}: Cox regression for right censored survival time data (negative values are considered right censored).
+
+Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR}).
+\item \code{"survival:aft"}: Accelerated failure time model for censored survival time data.
+See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
+\item \code{"multi:softprob"}: multi-class classification through multinomial logistic likelihood.
+\item \code{"reg:gamma"}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications}{gamma-distributed}.
+\item \code{"reg:tweedie"}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications}{Tweedie-distributed}.
+}}
 
 \item{nrounds}{Number of boosting iterations / rounds.
 
 Note that the number of default boosting rounds here is not automatically tuned, and different
 problems will have vastly different optimal numbers of boosting rounds.}
 
+\item{max_depth}{(for Tree Booster) (default=6)
+Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree. \code{"exact"} tree method requires non-zero value.
+ +range: \eqn{[0, \infty)}} + +\item{learning_rate}{(default=0.3, alias: \code{eta}) +Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and \code{learning_rate} shrinks the feature weights to make the boosting process more conservative. + +range: \eqn{[0,1]}} + +\item{min_child_weight}{(for Tree Booster) (default=1) +Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than \code{min_child_weight}, then the building process will give up further partitioning. In linear regression task, this simply corresponds to minimum number of instances needed to be in each node. The larger \code{min_child_weight} is, the more conservative the algorithm will be. + +range: \eqn{[0, \infty)}} + +\item{min_split_loss}{(for Tree Booster) (default=0, alias: \code{gamma}) +Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger \code{min_split_loss} is, the more conservative the algorithm will be. Note that a tree where no splits were made might still contain a single terminal node with a non-zero score. + +range: \eqn{[0, \infty)}} + +\item{reg_lambda}{(alias: \code{lambda}) +\itemize{ +\item For tree-based boosters: +\itemize{ +\item L2 regularization term on weights. Increasing this value will make model more conservative. +\item default: 1 +\item range: \eqn{[0, \infty]} +} +\item For linear booster: +\itemize{ +\item L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples. +\item default: 0 +\item range: \eqn{[0, \infty)} +} +}} + \item{weights}{Sample weights for each row in \code{x} and \code{y}. If \code{NULL} (the default), each row will have the same weight. @@ -140,11 +238,83 @@ included regardless of this 'n'. Only has an effect when passing \code{verbosity>0}.} +\item{eval_metric}{(default according to objective) +\itemize{ +\item Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, \verb{mean average precision} for \code{rank:map}, etc.) +\item User can add multiple evaluation metrics. +\item The choices are listed below: +\itemize{ +\item \code{"rmse"}: \href{http://en.wikipedia.org/wiki/Root_mean_square_error}{root mean square error} +\item \code{"rmsle"}: root mean square log error: \eqn{\sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2}}. Default metric of \code{"reg:squaredlogerror"} objective. This metric reduces errors generated by outliers in dataset. But because \code{log} function is employed, \code{"rmsle"} might output \code{nan} when prediction value is less than -1. See \code{"reg:squaredlogerror"} for other requirements. +\item \code{"mae"}: \href{https://en.wikipedia.org/wiki/Mean_absolute_error}{mean absolute error} +\item \code{"mape"}: \href{https://en.wikipedia.org/wiki/Mean_absolute_percentage_error}{mean absolute percentage error} +\item \code{"mphe"}: \href{https://en.wikipedia.org/wiki/Huber_loss}{mean Pseudo Huber error}. Default metric of \code{"reg:pseudohubererror"} objective. +\item \code{"logloss"}: \href{http://en.wikipedia.org/wiki/Log-likelihood}{negative log-likelihood} +\item \code{"error"}: Binary classification error rate. It is calculated as \verb{#(wrong cases)/#(all cases)}. 
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. +\item \code{"error@t"}: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'. +\item \code{"merror"}: Multiclass classification error rate. It is calculated as \verb{#(wrong cases)/#(all cases)}. +\item \code{"mlogloss"}: \href{http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}{Multiclass logloss}. +\item \code{"auc"}: \href{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve}{Receiver Operating Characteristic Area under the Curve}. +Available for classification and learning-to-rank tasks. +\itemize{ +\item When used with binary classification, the objective should be \code{"binary:logistic"} or similar functions that work on probability. +\item When used with multi-class classification, objective should be \code{"multi:softprob"} instead of \code{"multi:softmax"}, as the latter doesn't output probability. Also the AUC is calculated by 1-vs-rest with reference class weighted by class prevalence. +\item When used with LTR task, the AUC is computed by comparing pairs of documents to count correctly sorted pairs. This corresponds to pairwise learning to rank. The implementation has some issues with average AUC around groups and distributed workers not being well-defined. +\item On a single machine the AUC calculation is exact. In a distributed environment the AUC is a weighted average over the AUC of training rows on each node - therefore, distributed AUC is an approximation sensitive to the distribution of data across workers. Use another metric in distributed environments if precision and reproducibility are important. +\item When input dataset contains only negative or positive samples, the output is \code{NaN}. The behavior is implementation defined, for instance, \code{scikit-learn} returns \eqn{0.5} instead. +} +\item \code{"aucpr"}: \href{https://en.wikipedia.org/wiki/Precision_and_recall}{Area under the PR curve}. +Available for classification and learning-to-rank tasks. + +After XGBoost 1.6, both of the requirements and restrictions for using \code{"aucpr"} in classification problem are similar to \code{"auc"}. For ranking task, only binary relevance label \eqn{y \in [0, 1]} is supported. Different from \code{"map"} (mean average precision), \code{"aucpr"} calculates the \emph{interpolated} area under precision recall curve using continuous interpolation. +\item \code{"pre"}: Precision at \eqn{k}. Supports only learning to rank task. +\item \code{"ndcg"}: \href{http://en.wikipedia.org/wiki/NDCG}{Normalized Discounted Cumulative Gain} +\item \code{"map"}: \href{http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision}{Mean Average Precision} + +The \verb{average precision} is defined as: + +\eqn{AP@l = \frac{1}{min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}} + +where \eqn{I_{(k)}} is an indicator function that equals to \eqn{1} when the document at \eqn{k} is relevant and \eqn{0} otherwise. The \eqn{P@k} is the precision at \eqn{k}, and \eqn{N} is the total number of relevant documents. Lastly, the \verb{mean average precision} is defined as the weighted average across all queries. +\item \code{"ndcg@n"}, \code{"map@n"}, \code{"pre@n"}: \eqn{n} can be assigned as an integer to cut off the top positions in the lists for evaluation. 
+\item \code{"ndcg-"}, \code{"map-"}, \code{"ndcg@n-"}, \code{"map@n-"}: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as \eqn{1}. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as \eqn{0} to be consistent under some conditions. +\item \code{"poisson-nloglik"}: negative log-likelihood for Poisson regression +\item \code{"gamma-nloglik"}: negative log-likelihood for gamma regression +\item \code{"cox-nloglik"}: negative partial log-likelihood for Cox proportional hazards regression +\item \code{"gamma-deviance"}: residual deviance for gamma regression +\item \code{"tweedie-nloglik"}: negative log-likelihood for Tweedie regression (at a specified value of the \code{tweedie_variance_power} parameter) +\item \code{"aft-nloglik"}: Negative log likelihood of Accelerated Failure Time model. +See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details. +\item \code{"interval-regression-accuracy"}: Fraction of data points whose predicted labels fall in the interval-censored labels. +Only applicable for interval-censored data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details. +} +}} + \item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.} \item{seed}{Seed to use for random number generation. If passing \code{NULL}, will draw a random number using R's PRNG system to use as seed.} +\item{base_margin}{Base margin used for boosting from existing model. + +If passing it, will start the gradient boosting procedure from the scores that are provided +here - for example, one can pass the raw scores from a previous model, or some per-observation +offset, or similar. + +Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) +with the same number of rows as \code{x} and number of columns corresponding to number of optimization +targets, and should be in the untransformed scale (for example, for objective \code{binary:logistic}, +it should have log-odds, not probabilities; and for objective \code{multi:softprob}, should have +number of columns matching to number of classes in the data). + +Note that, if it contains more than one column, then columns will not be matched by name to +the corresponding \code{y} - \code{base_margin} should have the same column order that the model will use +(for example, for objective \code{multi:softprob}, columns of \code{base_margin} will be matched against +\code{levels(y)} by their position, regardless of what \code{colnames(base_margin)} returns). + +If \code{NULL}, will start from zero, but note that for most objectives, an intercept is usually +added (controllable through parameter \code{base_score} instead) when \code{base_margin} is not passed.} + \item{monotone_constraints}{Optional monotonicity constraints for features. Can be passed either as a named list (when \code{x} has column names), or as a vector. If passed @@ -173,6 +343,44 @@ interact with each other. See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{Feature Interaction Constraints} for more information.} +\item{reg_alpha}{(alias: \code{reg_alpha}) +\itemize{ +\item L1 regularization term on weights. Increasing this value will make model more conservative. 
+\item For the linear booster, it's normalised to number of training examples. +\item default: 0 +\item range: \eqn{[0, \infty)} +}} + +\item{max_bin}{(for Tree Booster) (default=256) +\itemize{ +\item Only used if \code{tree_method} is set to \code{"hist"} or \code{"approx"}. +\item Maximum number of discrete bins to bucket continuous features. +\item Increasing this number improves the optimality of splits at the cost of higher computation time. +}} + +\item{max_leaves}{(for Tree Booster) (default=0) +Maximum number of nodes to be added. Not used by \code{"exact"} tree method.} + +\item{booster}{(default= \code{"gbtree"}) +Which booster to use. Can be \code{"gbtree"}, \code{"gblinear"} or \code{"dart"}; \code{"gbtree"} and \code{"dart"} use tree based models while \code{"gblinear"} uses linear functions.} + +\item{subsample}{(for Tree Booster) (default=1) +Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting. Subsampling will occur once in every boosting iteration. + +range: \eqn{(0,1]}} + +\item{sampling_method}{(for Tree Booster) (default= \code{"uniform"}) +The method to use to sample the training instances. +\itemize{ +\item \code{"uniform"}: each training instance has an equal probability of being selected. Typically set +\code{"subsample"} >= 0.5 for good results. +\item \code{"gradient_based"}: the selection probability for each training instance is proportional to the +\bold{regularized absolute value} of gradients (more specifically, \eqn{\sqrt{g^2+\lambda h^2}}). +\code{"subsample"} may be set to as low as 0.1 without loss of model accuracy. Note that this +sampling method is only supported when \code{"tree_method"} is set to \code{"hist"} and the device is \code{"cuda"}; other tree +methods only support \code{"uniform"} sampling. +}} + \item{feature_weights}{Feature weights for column sampling. Can be passed either as a vector with length matching to columns of \code{x}, or as a named @@ -181,25 +389,218 @@ named vector, will try to match the entries to column names of \code{x} by name. If \code{NULL} (the default), all columns will have the same weight.} -\item{base_margin}{Base margin used for boosting from existing model. +\item{colsample_bytree, colsample_bylevel, colsample_bynode}{(for Tree Booster) (default=1) +This is a family of parameters for subsampling of columns. +\itemize{ +\item All \code{"colsample_by*"} parameters have a range of \eqn{(0, 1]}, the default value of 1, and specify the fraction of columns to be subsampled. +\item \code{"colsample_bytree"} is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. +\item \code{"colsample_bylevel"} is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. +\item \code{"colsample_bynode"} is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method. +\item \code{"colsample_by*"} parameters work cumulatively. For instance, +the combination \verb{\{'colsample_bytree'=0.5, 'colsample_bylevel'=0.5, 'colsample_bynode'=0.5\}} with 64 features will leave 8 features to choose from at +each split. 
+} -If passing it, will start the gradient boosting procedure from the scores that are provided -here - for example, one can pass the raw scores from a previous model, or some per-observation -offset, or similar. +One can set the \code{"feature_weights"} for DMatrix to +define the probability of each feature being selected when using column sampling.} -Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) -with the same number of rows as \code{x} and number of columns corresponding to number of optimization -targets, and should be in the untransformed scale (for example, for objective \code{binary:logistic}, -it should have log-odds, not probabilities; and for objective \code{multi:softprob}, should have -number of columns matching to number of classes in the data). +\item{tree_method}{(for Tree Booster) (default= \code{"auto"}) +The tree construction algorithm used in XGBoost. See description in the \href{http://arxiv.org/abs/1603.02754}{reference paper} and \href{https://xgboost.readthedocs.io/en/latest/treemethod.html}{Tree Methods}. -Note that, if it contains more than one column, then columns will not be matched by name to -the corresponding \code{y} - \code{base_margin} should have the same column order that the model will use -(for example, for objective \code{multi:softprob}, columns of \code{base_margin} will be matched against -\code{levels(y)} by their position, regardless of what \code{colnames(base_margin)} returns). +Choices: \code{"auto"}, \code{"exact"}, \code{"approx"}, \code{"hist"}, this is a combination of commonly +used updaters. For other updaters like \code{"refresh"}, set the parameter \code{updater} +directly. +\itemize{ +\item \code{"auto"}: Same as the \code{"hist"} tree method. +\item \code{"exact"}: Exact greedy algorithm. Enumerates all split candidates. +\item \code{"approx"}: Approximate greedy algorithm using quantile sketch and gradient histogram. +\item \code{"hist"}: Faster histogram optimized approximate greedy algorithm. +}} -If \code{NULL}, will start from zero, but note that for most objectives, an intercept is usually -added (controllable through parameter \code{base_score} instead) when \code{base_margin} is not passed.} +\item{max_delta_step}{(for Tree Booster) (default=0) +Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update. + +range: \eqn{[0, \infty)}} + +\item{scale_pos_weight}{(for Tree Booster) (default=1) +Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: \verb{sum(negative instances) / sum(positive instances)}. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html}{Parameters Tuning} for more discussion. 
Also, see Higgs Kaggle competition demo for examples: \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{R}, \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py}{py1}, \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py}{py2}, \href{https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py}{py3}.}
+
+\item{updater}{(for Linear Booster) (default= \code{"shotgun"})
+Choice of algorithm to fit linear model
+\itemize{
+\item \code{"shotgun"}: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run.
+\item \code{"coord_descent"}: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the \code{device} parameter is set to \code{"cuda"} or \code{"gpu"}, a GPU variant would be used.
+}}
+
+\item{grow_policy}{(for Tree Booster) (default= \code{"depthwise"})
+\itemize{
+\item Controls a way new nodes are added to the tree.
+\item Currently supported only if \code{tree_method} is set to \code{"hist"} or \code{"approx"}.
+\item Choices: \code{"depthwise"}, \code{"lossguide"}
+\itemize{
+\item \code{"depthwise"}: split at nodes closest to the root.
+\item \code{"lossguide"}: split at nodes with highest loss change.
+}
+}}
+
+\item{num_parallel_tree}{(for Tree Booster) (default=1)
+Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.}
+
+\item{multi_strategy}{(for Tree Booster) (default = \code{"one_output_per_tree"})
+The strategy used for training multi-target models, including multi-target regression
+and multi-class classification. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/multioutput.html}{Multiple Outputs} for more information.
+\itemize{
+\item \code{"one_output_per_tree"}: One model for each target.
+\item \code{"multi_output_tree"}: Use multi-target trees.
+}
+
+Version added: 2.0.0
+
+Note: This parameter is working-in-progress.}
+
+\item{base_score}{\itemize{
+\item The initial prediction score of all instances, global bias
+\item The parameter is automatically estimated for selected objectives before training. To
+disable the estimation, specify a real number argument.
+\item If \code{base_margin} is supplied, \code{base_score} will not be added.
+\item For sufficient number of iterations, changing this value will not have too much effect.
+}}
+
+\item{seed_per_iteration}{(default= \code{FALSE})
+Seed PRNG deterministically via iterator number.}
+
+\item{device}{(default= \code{"cpu"})
+Device for XGBoost to run. User can set it to one of the following values:
+\itemize{
+\item \code{"cpu"}: Use CPU.
+\item \code{"cuda"}: Use a GPU (CUDA device).
+\item \code{"cuda:<ordinal>"}: \verb{<ordinal>} is an integer that specifies the ordinal of the GPU (which GPU do you want to use if you have more than one device).
+\item \code{"gpu"}: Default GPU device selection from the list of available and supported devices. Only \code{"cuda"} devices are supported currently.
+\item \code{"gpu:<ordinal>"}: Default GPU device selection from the list of available and supported devices. Only \code{"cuda"} devices are supported currently.
+}
+
+For more information about GPU acceleration, see \href{https://xgboost.readthedocs.io/en/latest/gpu/index.html}{XGBoost GPU Support}. In distributed environments, ordinal selection is handled by distributed frameworks instead of XGBoost.
As a result, using \code{"cuda:<ordinal>"} will result in an error. Use \code{"cuda"} instead.
+
+Version added: 2.0.0
+
+Note: if XGBoost was installed from CRAN, it won't have GPU support enabled, thus only \code{"cpu"} will be available.
+To get GPU support, the R package for XGBoost must be installed from source or from the GitHub releases - see
+\href{https://xgboost.readthedocs.io/en/latest/install.html#r}{instructions}.}
+
+\item{disable_default_eval_metric}{(default= \code{FALSE})
+Flag to disable default metric. Set to 1 or \code{TRUE} to disable.}
+
+\item{use_rmm}{Whether to use RAPIDS Memory Manager (RMM) to allocate cache GPU
+memory. The primary memory is always allocated on the RMM pool when XGBoost is built
+(compiled) with the RMM plugin enabled. Valid values are \code{TRUE} and \code{FALSE}. See
+\href{https://xgboost.readthedocs.io/en/latest/python/rmm-examples/index.html}{Using XGBoost with RAPIDS Memory Manager (RMM) plugin} for details.}
+
+\item{max_cached_hist_node}{(for Non-Exact Tree Methods) (default = 65536)
+Maximum number of cached nodes for histogram. This can be used with the \code{"hist"} and the
+\code{"approx"} tree methods.
+
+Version added: 2.0.0
+\itemize{
+\item For most of the cases this parameter should not be set except for growing deep
+trees. After 3.0, this parameter affects GPU algorithms as well.
+}}
+
+\item{extmem_single_page}{(for Non-Exact Tree Methods) (default = \code{FALSE})
+This parameter is only used for the \code{"hist"} tree method with \code{device="cuda"} and
+\code{subsample != 1.0}. Before 3.0, pages were always concatenated.
+
+Version added: 3.0.0
+
+Whether the GPU-based \code{"hist"} tree method should concatenate the training data into a
+single batch instead of fetching data on-demand when external memory is used. For GPU
+devices that don't support address translation services, external memory training is
+expensive. This parameter can be used in combination with subsampling to reduce overall
+memory usage without significant overhead. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/external_memory.html}{Using XGBoost External Memory Version} for
+more information.}
+
+\item{max_cat_to_onehot}{(for Non-Exact Tree Methods)
+A threshold for deciding whether XGBoost should use one-hot encoding based split for
+categorical data. When the number of categories is less than the threshold then one-hot
+encoding is chosen, otherwise the categories will be partitioned into children nodes.
+
+Version added: 1.6.0}
+
+\item{max_cat_threshold}{(for Non-Exact Tree Methods)
+Maximum number of categories considered for each split. Used only by partition-based
+splits for preventing over-fitting.
+
+Version added: 1.7.0}
+
+\item{sample_type}{(for Dart Booster) (default= \code{"uniform"})
+Type of sampling algorithm.
+\itemize{
+\item \code{"uniform"}: dropped trees are selected uniformly.
+\item \code{"weighted"}: dropped trees are selected in proportion to weight.
+}}
+
+\item{normalize_type}{(for Dart Booster) (default= \code{"tree"})
+Type of normalization algorithm.
+\itemize{
+\item \code{"tree"}: new trees have the same weight of each of dropped trees.
+\itemize{
+\item Weight of new trees are \code{1 / (k + learning_rate)}.
+\item Dropped trees are scaled by a factor of \code{1 / (1 + learning_rate)}. +} +}} + +\item{rate_drop}{(for Dart Booster) (default=0.0) +Dropout rate (a fraction of previous trees to drop during the dropout). + +range: \eqn{[0.0, 1.0]}} + +\item{one_drop}{(for Dart Booster) (default=0) +When this flag is enabled, at least one tree is always dropped during the dropout (allows Binomial-plus-one or epsilon-dropout from the original DART paper).} + +\item{skip_drop}{(for Dart Booster) (default=0.0) +Probability of skipping the dropout procedure during a boosting iteration. +\itemize{ +\item If a dropout is skipped, new trees are added in the same manner as \code{"gbtree"}. +\item Note that non-zero \code{skip_drop} has higher priority than \code{rate_drop} or \code{one_drop}. +} + +range: \eqn{[0.0, 1.0]}} + +\item{feature_selector}{(for Linear Booster) (default= \code{"cyclic"}) +Feature selection and ordering method +\itemize{ +\item \code{"cyclic"}: Deterministic selection by cycling through features one at a time. +\item \code{"shuffle"}: Similar to \code{"cyclic"} but with random feature shuffling prior to each update. +\item \code{"random"}: A random (with replacement) coordinate selector. +\item \code{"greedy"}: Select coordinate with the greatest gradient magnitude. It has \code{O(num_feature^2)} complexity. It is fully deterministic. It allows restricting the selection to \code{top_k} features per group with the largest magnitude of univariate weight change, by setting the \code{top_k} parameter. Doing so would reduce the complexity to \code{O(num_feature*top_k)}. +\item \code{"thrifty"}: Thrifty, approximately-greedy feature selector. Prior to cyclic updates, reorders features in descending magnitude of their univariate weight changes. This operation is multithreaded and is a linear complexity approximation of the quadratic greedy selection. It allows restricting the selection to \code{top_k} features per group with the largest magnitude of univariate weight change, by setting the \code{top_k} parameter. +}} + +\item{top_k}{(for Linear Booster) (default=0) +The number of top features to select in \code{greedy} and \code{thrifty} feature selector. The value of 0 means using all the features.} + +\item{tweedie_variance_power}{(for Tweedie Regression (\code{"objective=reg:tweedie"})) (default=1.5) +\itemize{ +\item Parameter that controls the variance of the Tweedie distribution \code{var(y) ~ E(y)^tweedie_variance_power} +\item range: \eqn{(1,2)} +\item Set closer to 2 to shift towards a gamma distribution +\item Set closer to 1 to shift towards a Poisson distribution. +}} + +\item{huber_slope}{(for using Pseudo-Huber (\verb{"reg:pseudohubererror}")) (default = 1.0) +A parameter used for Pseudo-Huber loss to define the \eqn{\delta} term.} + +\item{quantile_alpha}{(for using Quantile Loss (\code{"reg:quantileerror"})) +A scalar or a list of targeted quantiles (passed as a numeric vector). + +Version added: 2.0.0} + +\item{aft_loss_distribution}{(for using AFT Survival Loss (\code{"survival:aft"}) and Negative Log Likelihood of AFT metric (\code{"aft-nloglik"})) +Probability Density Function, \code{"normal"}, \code{"logistic"}, or \code{"extreme"}.} \item{...}{Other training parameters. See the online documentation \href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for @@ -228,6 +629,11 @@ possible functionalities of the core XGBoost library. 
See \code{\link[=xgb.train]{xgb.train()}} for a more flexible low-level alternative which is similar across different language bindings of XGBoost and which exposes the full library's functionalities. + +By default, most of the parameters here have a value of \code{NULL}, which signals XGBoost to use its +default value. Default values are automatically determined by the XGBoost core library, and are +subject to change over XGBoost library versions. Some of them might differ according to the +booster type (e.g. defaults for regularization are different for linear and tree-based boosters). } \details{ For package authors using 'xgboost' as a dependency, it is highly recommended to use From ec912968e6fcc42d2a6693cc3c57a41b9d056b47 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 16 Dec 2024 21:02:51 +0100 Subject: [PATCH 2/6] remove unused entry --- R-package/R/xgboost.R | 7 ------- R-package/man/xgboost.Rd | 8 -------- 2 files changed, 15 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index fbf7cc4b01aa..4cafaa5a5fca 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1049,13 +1049,6 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' Choice of algorithm to fit linear model #' - `"shotgun"`: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run. #' - `"coord_descent"`: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the `device` parameter is set to `"cuda"` or `"gpu"`, a GPU variant would be used. -#' @param ... Other training parameters. See the online documentation -#' [XGBoost Parameters](https://xgboost.readthedocs.io/en/stable/parameter.html) for -#' details about possible values and what they do. -#' -#' Note that not all possible values from the core XGBoost library are allowed as `params` for -#' 'xgboost()' - in particular, values which require an already-fitted booster object (such as -#' `process_type`) are not accepted here. #' @inheritParams xgb.params #' @return A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular #' `xgb.Booster` model class produced by [xgb.train()], this `xgboost` class will have an diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 61a1c290479d..39b513baa176 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -601,14 +601,6 @@ Version added: 2.0.0} \item{aft_loss_distribution}{(for using AFT Survival Loss (\code{"survival:aft"}) and Negative Log Likelihood of AFT metric (\code{"aft-nloglik"})) Probability Density Function, \code{"normal"}, \code{"logistic"}, or \code{"extreme"}.} - -\item{...}{Other training parameters. See the online documentation -\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for -details about possible values and what they do. - -Note that not all possible values from the core XGBoost library are allowed as \code{params} for -'xgboost()' - in particular, values which require an already-fitted booster object (such as -\code{process_type}) are not accepted here.} } \value{ A model object, inheriting from both \code{xgboost} and \code{xgb.Booster}. 
Compared to the regular From a467da0ea7c2c62d4adbae98d885b512ed5c35a5 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 18 Dec 2024 19:05:35 +0100 Subject: [PATCH 3/6] more details about what is not supported --- R-package/R/xgboost.R | 9 +++++++++ R-package/man/xgboost.Rd | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 4cafaa5a5fca..eaae2335bc7b 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -927,6 +927,15 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' - `"multi:softprob"`: multi-class classification throgh multinomial logistic likelihood. #' - `"reg:gamma"`: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be [gamma-distributed](https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications). #' - `"reg:tweedie"`: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be [Tweedie-distributed](https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications). +#' +#' The following values are \bold{NOT} supported by `xgboost`, but are supported by [xgb.train()] +#' (see [xgb.params()] for details): +#' - `"reg:logistic"` +#' - `"binary:logitraw"` +#' - `"multi:softmax"` +#' - `"rank:ndcg"` +#' - `"rank:map"` +#' - `"rank:pairwise"` #' @param nrounds Number of boosting iterations / rounds. #' #' Note that the number of default boosting rounds here is not automatically tuned, and different diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 39b513baa176..e77a608ab0cc 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -150,6 +150,17 @@ See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analys \item \code{"multi:softprob"}: multi-class classification throgh multinomial logistic likelihood. \item \code{"reg:gamma"}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications}{gamma-distributed}. \item \code{"reg:tweedie"}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications}{Tweedie-distributed}. +} + +The following values are \bold{NOT} supported by \code{xgboost}, but are supported by \code{\link[=xgb.train]{xgb.train()}} +(see \code{\link[=xgb.params]{xgb.params()}} for details): +\itemize{ +\item \code{"reg:logistic"} +\item \code{"binary:logitraw"} +\item \code{"multi:softmax"} +\item \code{"rank:ndcg"} +\item \code{"rank:map"} +\item \code{"rank:pairwise"} }} \item{nrounds}{Number of boosting iterations / rounds. 
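
As a usage illustration of the interface that the patches above describe (the snippet itself is not part of any diff in this series): the sketch below assumes the argument names shown in the updated `xgboost()` signature (`x`, `y`, `nrounds`, `max_depth`, `learning_rate`, `reg_lambda`, `nthreads`), the automatic objective selection documented for `y`, and the package's S3 `predict` method for the returned model class. The datasets and the specific parameter values are illustrative only.

```r
library(xgboost)

# Numeric 'y': the objective is auto-selected as "reg:squarederror".
fit_reg <- xgboost(
  x = mtcars[, -1],
  y = mtcars$mpg,
  nrounds = 50,         # boosting rounds; not tuned automatically
  max_depth = 3,        # tree-booster parameter, now an explicit argument
  learning_rate = 0.1,  # alias of 'eta' in the core library
  reg_lambda = 2,       # L2 regularization on weights
  nthreads = 1
)
head(predict(fit_reg, mtcars[, -1]))

# Factor 'y' with more than 2 levels: the objective is auto-selected as
# "multi:softprob" and the number of classes is determined from the factor levels.
fit_cls <- xgboost(
  x = iris[, 1:4],
  y = iris$Species,
  nrounds = 20,
  max_depth = 2,
  nthreads = 1
)
```

Parameters left as `NULL` are simply not forwarded, so the core library's own defaults apply, as described in the `xgb.params()` notes above.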

From ddbb0758cfb0562b3b2b95048946dc34812251bd Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Thu, 19 Dec 2024 19:22:26 +0100
Subject: [PATCH 4/6] more links to parameters and online docs

---
 R-package/R/xgboost.R    | 7 ++++++-
 R-package/man/xgboost.Rd | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index eaae2335bc7b..f51691893362 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -832,7 +832,9 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) {
 #' Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.
 #'
 #' See the tutorial [Introduction to Boosted Trees](https://xgboost.readthedocs.io/en/stable/tutorials/model.html)
-#' for a longer explanation of what XGBoost does.
+#' for a longer explanation of what XGBoost does, and the rest of the
+#' [XGBoost Tutorials](https://xgboost.readthedocs.io/en/latest/tutorials/index.html) for further
+#' explanations of XGBoost's features and usage.
 #'
 #' This function is intended to provide a more user-friendly interface for XGBoost that follows
 #' R's conventions for model fitting and predictions, but which doesn't expose all of the
@@ -845,6 +847,9 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) {
 #' default value. Default values are automatically determined by the XGBoost core library, and are
 #' subject to change over XGBoost library versions. Some of them might differ according to the
 #' booster type (e.g. defaults for regularization are different for linear and tree-based boosters).
+#' See [xgb.params()] and the [online documentation](https://xgboost.readthedocs.io/en/latest/parameter.html)
+#' for more details about parameters - but note that some of the parameters are not supported in
+#' the `xgboost()` interface.
 #' @details
 #' For package authors using 'xgboost' as a dependency, it is highly recommended to use
 #' [xgb.train()] in package code instead of [xgboost()], since it has a more stable interface
diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd
index e77a608ab0cc..20520e757492 100644
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -624,7 +624,9 @@ outputs, such as class names for classification problems.
 Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.
 
 See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{Introduction to Boosted Trees}
-for a longer explanation of what XGBoost does.
+for a longer explanation of what XGBoost does, and the rest of the
+\href{https://xgboost.readthedocs.io/en/latest/tutorials/index.html}{XGBoost Tutorials} for further
+explanations of XGBoost's features and usage.
 
 This function is intended to provide a more user-friendly interface for XGBoost that follows
 R's conventions for model fitting and predictions, but which doesn't expose all of the
@@ -637,6 +639,9 @@ By default, most of the parameters here have a value of \code{NULL}, which signa
 default value. Default values are automatically determined by the XGBoost core library, and are
 subject to change over XGBoost library versions. Some of them might differ according to the
 booster type (e.g. defaults for regularization are different for linear and tree-based boosters).
+See \code{\link[=xgb.params]{xgb.params()}} and the \href{https://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation} +for more details about parameters - but note that some of the parameters are not supported in +the \code{xgboost()} interface. } \details{ For package authors using 'xgboost' as a dependency, it is highly recommended to use From d36841f7ef83dd56ffb7ae1f4a6ac0142cd6970d Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 3 Jan 2025 17:02:49 +0100 Subject: [PATCH 5/6] fix linter for docs --- R-package/R/xgboost.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index f51691893362..338df9770015 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -825,6 +825,7 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { return(early_stopping_rounds) } +# nolint start: line_length_linter. #' Fit XGBoost Model #' #' @export @@ -1141,6 +1142,7 @@ xgboost <- function( quantile_alpha = NULL, aft_loss_distribution = NULL ) { +# nolint end params <- as.list(environment()) params <- params[ (names(params) %in% formalArgs(xgb.params)) From 46456cf292c2c5d59de301f6cfc0a24729f6a5e5 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 3 Jan 2025 17:05:20 +0100 Subject: [PATCH 6/6] correct default values for learning rate --- R-package/R/xgboost.R | 6 +++--- R-package/man/xgboost.Rd | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 338df9770015..b57cd78030ef 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1042,10 +1042,10 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger `min_split_loss` is, the more conservative the algorithm will be. Note that a tree where no splits were made might still contain a single terminal node with a non-zero score. #' #' range: \eqn{[0, \infty)} -#' @param learning_rate (default=0.3, alias: `eta`) +#' @param learning_rate (alias: `eta`) #' Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and `learning_rate` shrinks the feature weights to make the boosting process more conservative. -#' -#' range: \eqn{[0,1]} +#' - range: \eqn{[0,1]} +#' - default value: 0.3 for tree-based boosters, 0.5 for linear booster. #' @param reg_lambda (alias: `lambda`) #' - For tree-based boosters: #' - L2 regularization term on weights. Increasing this value will make model more conservative. diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 20520e757492..1c96b1ff498b 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -173,10 +173,12 @@ Maximum depth of a tree. Increasing this value will make the model more complex range: \eqn{[0, \infty)}} -\item{learning_rate}{(default=0.3, alias: \code{eta}) +\item{learning_rate}{(alias: \code{eta}) Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and \code{learning_rate} shrinks the feature weights to make the boosting process more conservative. - -range: \eqn{[0,1]}} +\itemize{ +\item range: \eqn{[0,1]} +\item default value: 0.3 for tree-based boosters, 0.5 for linear booster. +}} \item{min_child_weight}{(for Tree Booster) (default=1) Minimum sum of instance weight (hessian) needed in a child. 
If the tree partition step results in a leaf node with the sum of instance weight less than \code{min_child_weight}, then the building process will give up further partitioning. In linear regression task, this simply corresponds to minimum number of instances needed to be in each node. The larger \code{min_child_weight} is, the more conservative the algorithm will be.
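
To round out the series: below is a self-contained sketch (not part of any patch above) of the argument-collection idiom that PATCH 1 introduces in the body of `xgboost()`, where `as.list(environment())` captures the function's own formals and only the non-`NULL` ones that are also formals of `xgb.params()` are forwarded as training parameters. The toy functions and parameter names here are illustrative assumptions, not package code.

```r
# Toy stand-in for xgb.params(): its formals define which argument
# names count as training parameters.
params_reference <- function(learning_rate = NULL, max_depth = NULL, reg_lambda = NULL) NULL

collect_params <- function(learning_rate = NULL, max_depth = NULL, nrounds = 100L) {
  params <- as.list(environment())  # all formals of this function with their current values
  keep <- names(params) %in% methods::formalArgs(params_reference) &
    !vapply(params, is.null, logical(1))
  params[keep]                      # only non-NULL, recognized training parameters remain
}

collect_params(max_depth = 4L)
#> $max_depth
#> [1] 4
```

Compared with the earlier `...`-based workaround that PATCH 1 removes, this keeps every supported parameter visible in the function signature while still forwarding only what the caller actually set.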