From 607b89858af3ec592525cead7cd06911161fc08a Mon Sep 17 00:00:00 2001 From: monopteryx Date: Fri, 9 Aug 2024 17:44:38 +0300 Subject: [PATCH] I updated the code to avoid warnings --- DESCRIPTION | 7 +++--- Dockerfile | 5 +--- NAMESPACE | 6 +---- NEWS.md | 5 ++-- R/feature_selection.R | 45 +++++++++++++++++++++++++---------- R/wrapper_feature_selection.R | 10 +++++--- 6 files changed, 48 insertions(+), 30 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5422c53..6424137 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: FeatureSelection Type: Package Title: Feature extraction and selection based on 'glmnet', 'xgboost' and 'ranger' Version: 1.0.0 -Date: 2021-05-19 +Date: 2024-08-09 Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546"))) Maintainer: Lampros Mouselimis BugReports: https://github.com/mlampros/FeatureSelection/issues @@ -12,7 +12,7 @@ Depends: R(>= 3.3.0) Imports: doParallel, - dplyr, + data.table, glmnet, ranger, xgboost, @@ -28,6 +28,5 @@ Suggests: covr SystemRequirements: update: apt-get -y update (deb) License: GPL-3 -LazyData: TRUE Encoding: UTF-8 -RoxygenNote: 7.1.1 +RoxygenNote: 7.3.0 diff --git a/Dockerfile b/Dockerfile index eb4b397..3b0a702 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,16 @@ FROM rocker/rstudio:devel - LABEL maintainer='Lampros Mouselimis' - RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \ apt-get install -y make zlib1g-dev libssl-dev libcurl4-openssl-dev && \ apt-get install -y sudo && \ apt-get -y update && \ - R -e "install.packages(c( 'doParallel', 'dplyr', 'glmnet', 'ranger', 'xgboost', 'Matrix', 'magrittr', 'utils', 'stats', 'graphics', 'grDevices', 'rlang', 'testthat', 'covr', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ + R -e "install.packages(c( 'doParallel', 'data.table', 'glmnet', 'ranger', 'xgboost', 'Matrix', 'magrittr', 'utils', 'stats', 'graphics', 'grDevices', 'rlang', 'testthat', 'covr', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ R -e "remotes::install_github('mlampros/FeatureSelection', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ apt-get autoremove -y && \ apt-get clean - ENV USER rstudio diff --git a/NAMESPACE b/NAMESPACE index b09cdc4..303be28 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,12 +12,8 @@ export(regr_folds) export(wrapper_feat_select) importFrom(Matrix,Matrix) importFrom(Matrix,colSums) +importFrom(data.table,as.data.table) importFrom(doParallel,registerDoParallel) -importFrom(dplyr,funs) -importFrom(dplyr,group_by) -importFrom(dplyr,n) -importFrom(dplyr,summarize) -importFrom(dplyr,summarize_each) importFrom(glmnet,cv.glmnet) importFrom(grDevices,dev.cur) importFrom(grDevices,dev.off) diff --git a/NEWS.md b/NEWS.md index f1403f0..5839725 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,11 @@ ## FeatureSelection -* **18-05-2016**: I added tests and code-coverage +* **09-08-2024**: I replaced **dplyr** with **data.table** because the **dplyr** functions 'summarise_each_' and 'funs' gave deprecation warnings +* **19-05-2021**: I replaced **doMC** with **doParallel** because **doMC** does not work on both **Unix** and **Windows** OS (applies only to **'glmnet-lasso'** method if number of threads > 1) * **03-02-2020**: + Updated the R files so that *Feature Selection* works with the newest versions of the imported R packages + Adjusted the tests + Added Dockerfile and docker image + Updated the README.md and .travis.yml files -* **19-05-2021**: I replaced **doMC** with **doParallel** because **doMC** does not work on both **Unix** and **Windows** OS (applies only to **'glmnet-lasso'** method if number of threads > 1) +* **18-05-2016**: I added tests and code-coverage diff --git a/R/feature_selection.R b/R/feature_selection.R index 5736298..86d9728 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -1,6 +1,9 @@ utils::globalVariables(c("%>%", ".", + ".N", + ".SD", + "coefficients", "predict")) # Keep 'predict' as a global variable. It appears both in 'stats' and 'glmnet' however I can not specify 'predict.cv.glmnet' because the function does not appear in the >= 3.0.0 version of the package (I receive an error otherwise) @@ -32,7 +35,7 @@ utils::globalVariables(c("%>%", #' #' @export #' @importFrom glmnet cv.glmnet -#' @importFrom dplyr group_by summarize summarize_each funs n +#' @importFrom data.table as.data.table #' @importFrom doParallel registerDoParallel #' @importFrom xgboost xgb.DMatrix xgb.train xgb.importance #' @importFrom ranger ranger @@ -120,9 +123,18 @@ utils::globalVariables(c("%>%", #' } -feature_selection = function(X, y, method = NULL, params_glmnet = NULL, params_xgboost = NULL, params_ranger = NULL, xgb_sort = NULL, CV_folds = 5, stratified_regr = FALSE, - - scale_coefs_glmnet = FALSE, cores_glmnet = NULL, verbose = FALSE) { +feature_selection = function(X, + y, + method = NULL, + params_glmnet = NULL, + params_xgboost = NULL, + params_ranger = NULL, + xgb_sort = NULL, + CV_folds = 5, + stratified_regr = FALSE, + scale_coefs_glmnet = FALSE, + cores_glmnet = NULL, + verbose = FALSE) { if (is.null(method)) stop("use method = .. to select one of the available methods : xgboost, glmnet-lasso, ranger") if (CV_folds < 1) stop("CV_folds should be >= 1") @@ -376,9 +388,14 @@ feature_selection = function(X, y, method = NULL, params_glmnet = NULL, params_x else { - all_feat = data.frame(do.call('rbind', get_all_feat)) - - tbl_x = data.frame(all_feat %>% dplyr::group_by(.data$Feature) %>% dplyr::summarize(coefficients = mean(.data$coefficients, na.rm = TRUE), Frequency = dplyr::n())) # for ".data" see: https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887/3 + all_feat = data.frame(do.call('rbind', get_all_feat)) |> + data.table::as.data.table() + + tbl_x = all_feat[, .(coefficients = mean(coefficients, na.rm = TRUE), + Frequency = .N), + by = 'Feature'] |> + as.data.frame() + if (scale_coefs_glmnet) tbl_x[, 2] = abs(tbl_x[, 2]) tbl_x = tbl_x[order(tbl_x$Frequency, tbl_x$coefficients, decreasing = TRUE),] # the data.frame in 'glmnet-lasso' is sorted by Frequency (default) } @@ -498,9 +515,11 @@ feature_selection = function(X, y, method = NULL, params_glmnet = NULL, params_x gc() } - tbl_x = data.frame(do.call('rbind', get_all_feat)) + tbl_x = data.frame(do.call('rbind', get_all_feat)) |> + data.table::as.data.table() - tbl1 = data.frame(tbl_x %>% dplyr::group_by(.data$Feature) %>% dplyr::summarize_each(dplyr::funs(mean(., na.rm = TRUE)))) # for ".data" see: https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887/3 + tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |> + as.data.frame() if (is.null(xgb_sort) || (xgb_sort == 'Frequency')) { @@ -636,9 +655,11 @@ feature_selection = function(X, y, method = NULL, params_glmnet = NULL, params_x gc() } - tbl_x = data.frame(do.call('rbind', get_all_feat)) - - tbl1 = data.frame(tbl_x %>% dplyr::group_by(.data$Feature) %>% dplyr::summarize_each(dplyr::funs(mean(., na.rm = TRUE)))) # for ".data" see: https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887/3 + tbl_x = data.frame(do.call('rbind', get_all_feat)) |> + data.table::as.data.table() + + tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |> + as.data.frame() tbl1 = tbl1[order(tbl1[, 2], decreasing = TRUE), ] diff --git a/R/wrapper_feature_selection.R b/R/wrapper_feature_selection.R index 021302a..0c1a52d 100644 --- a/R/wrapper_feature_selection.R +++ b/R/wrapper_feature_selection.R @@ -22,7 +22,7 @@ #' Furthermore the user can limit the number of features using the keep_number_feat parameter of the params_feature list. #' #' @export -#' @importFrom dplyr group_by summarize n +#' @importFrom data.table as.data.table #' @importFrom magrittr %>% #' @importFrom rlang .data #' @@ -183,9 +183,13 @@ wrapper_feat_select = function(X, y, params_glmnet = NULL, params_xgboost = NULL modify_lst = lapply(out_union, function(x) data.frame(feature = x$features, rank = normalized(length(x$features):1))) - modify_lst1 = data.frame(do.call(rbind, modify_lst)) + modify_lst1 = data.frame(do.call(rbind, modify_lst)) |> + data.table::as.data.table() - tbl_x = data.frame(modify_lst1 %>% dplyr::group_by(.data$feature) %>% dplyr::summarize(importance = sum(rank, na.rm = TRUE), Frequency = dplyr::n())) + tbl_x = modify_lst1[, .(importance = sum(rank, na.rm = TRUE), + Frequency = .N), + by = 'feature'] |> + as.data.frame() tbl1 = tbl_x[order(tbl_x$importance, decreasing = TRUE), ]