diff --git a/DESCRIPTION b/DESCRIPTION
index 073a2a8..72c4965 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: ClusterR
 Type: Package
 Title: Gaussian Mixture Models, K-Means, Mini-Batch-Kmeans, K-Medoids and Affinity Propagation Clustering
-Version: 1.3.0
-Date: 2023-01-21
+Version: 1.3.1
+Date: 2023-02-01
 Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546")), person(given = "Conrad", family = "Sanderson", role = "cph", comment = "Author of the C++ Armadillo library"), person(given = "Ryan", family = "Curtin", role = "cph", comment = "Author of the C++ Armadillo library"), person(given = "Siddharth", family = "Agrawal", role = "cph", comment = "Author of the C code of the Mini-Batch-Kmeans algorithm (https://github.com/siddharth-agrawal/Mini-Batch-K-Means)"), person(given = "Brendan", family = "Frey", email = "frey@psi.toronto.edu", role = "cph", comment = "Author of the matlab code of the Affinity propagation algorithm (for commercial use please contact the author of the matlab code)"), person(given = "Delbert", family = "Dueck", role = "cph", comment = "Author of the matlab code of the Affinity propagation algorithm"), person(given = "Vitalie", family = "Spinu", email = "spinuvit@gmail.com", role = "ctb", comment = c(Github = "Github Contributor")) )
 Maintainer: Lampros Mouselimis <mouselimislampros@gmail.com>
 BugReports: https://github.com/mlampros/ClusterR/issues
diff --git a/NEWS.md b/NEWS.md
index a87b250..7d76dc7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,9 @@
+## Cluster 1.3.1
+
+* I fixed a mistake related to a potential warning of the *'Optimal_Clusters_GMM()'* function (see issue: https://github.com/mlampros/ClusterR/issues/45)
+
+
 ## Cluster 1.3.0
 
 * I updated the documentation of the *'Optimal_Clusters_KMeans()'* function related to the *'silhouette'* metric (see issue: https://github.com/mlampros/ClusterR/issues/42)
 
diff --git a/R/clustering_functions.R b/R/clustering_functions.R
index 2844de9..cad95b3 100644
--- a/R/clustering_functions.R
+++ b/R/clustering_functions.R
@@ -55,8 +55,16 @@ tryCatch_GMM <- function(data, gaussian_comps, dist_mode, seed_mode, km_iter, em
 #' dat = center_scale(dat)
 #'
 #' gmm = GMM(dat, 2, "maha_dist", "random_subset", 10, 10)
-#'
-GMM = function(data, gaussian_comps = 1, dist_mode = 'eucl_dist', seed_mode = 'random_subset', km_iter = 10, em_iter = 5, verbose = FALSE, var_floor = 1e-10, seed = 1) {
+
+GMM = function(data,
+               gaussian_comps = 1,
+               dist_mode = 'eucl_dist',
+               seed_mode = 'random_subset',
+               km_iter = 10,
+               em_iter = 5,
+               verbose = FALSE,
+               var_floor = 1e-10,
+               seed = 1) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -224,9 +232,17 @@ tryCatch_optimal_clust_GMM <- function(data, max_clusters, dist_mode, seed_mode,
 #' opt_gmm = Optimal_Clusters_GMM(dat, search_space, criterion = "AIC", plot_data = FALSE)
 #'
-
-Optimal_Clusters_GMM = function(data, max_clusters, criterion = "AIC", dist_mode = 'eucl_dist', seed_mode = 'random_subset',
-                                km_iter = 10, em_iter = 5, verbose = FALSE, var_floor = 1e-10, plot_data = TRUE, seed = 1) {
+Optimal_Clusters_GMM = function(data,
+                                max_clusters,
+                                criterion = "AIC",
+                                dist_mode = 'eucl_dist',
+                                seed_mode = 'random_subset',
+                                km_iter = 10,
+                                em_iter = 5,
+                                verbose = FALSE,
+                                var_floor = 1e-10,
+                                plot_data = TRUE,
+                                seed = 1) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -245,14 +261,19 @@ Optimal_Clusters_GMM = function(data, max_clusters, criterion = "AIC", dist_mode
   if (length(max_clusters) != 1) {
     plot_data = FALSE        # set "plot_data" to FALSE if the "max_clusters" parameter is not of length 1
-    if (ncol(data) < max(max_clusters) && verbose) { warning("the number of columns of the data should be larger than the maximum value of 'max_clusters'", call. = F); cat(" ", '\n') }
+    if (nrow(data) < max(max_clusters) && verbose) {
+      warning("the number of rows of the data should be larger than the maximum value of 'max_clusters'", call. = F)
+      cat(" ", '\n')
+    }
   }
   else {
-    if (ncol(data) < max_clusters && verbose) { warning("the number of columns of the data should be larger than 'max_clusters'", call. = F); cat(" ", '\n') }
+    if (nrow(data) < max_clusters && verbose) {
+      warning("the number of rows of the data should be larger than 'max_clusters'", call. = F)
+      cat(" ", '\n')
+    }
   }
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   if (length(max_clusters) == 1) {
@@ -267,52 +288,35 @@ Optimal_Clusters_GMM = function(data, max_clusters, criterion = "AIC", dist_mode
     gmm = tryCatch_optimal_clust_GMM(data, pass_vector, dist_mode, seed_mode, km_iter, em_iter, verbose, var_floor, criterion, seed)
 
     if ('Error' %in% names(gmm)) {
-
       return(gmm)
-
-    } else {
-
+    }
+    else {
       if (plot_data) {
-
         if (grDevices::dev.cur() != 1) {
-
           grDevices::dev.off()          # reset par()
         }
 
         vec_out = as.vector(gmm)
-
         tmp_VAL = as.vector(stats::na.omit(vec_out))
 
         if (length(which(is.na(vec_out))) > 0) {
-
           x_dis = (1:length(vec_out))[-which(is.na(vec_out))]
-
-          y_dis = vec_out[-which(is.na(vec_out))]}
-
+          y_dis = vec_out[-which(is.na(vec_out))]
+        }
         else {
-
           x_dis = 1:length(vec_out)
-
           y_dis = vec_out
         }
 
         y_MAX = max(tmp_VAL)
-
         graphics::plot(x = x_dis, y = y_dis, type = 'l', xlab = 'clusters', ylab = criterion, col = 'blue', lty = 3, axes = FALSE)
-
         graphics::axis(1, at = seq(1, length(vec_out) , by = 1))
-
         graphics::axis(2, at = seq(round(min(tmp_VAL) - round(summary(y_MAX)[['Max.']]) / 10), y_MAX + round(summary(y_MAX)[['Max.']]) / 10, by = round((summary(tmp_VAL)['Max.'] - summary(tmp_VAL)['Min.']) / 5)), las = 1, cex.axis = 0.8)
-
-        graphics::abline(h = seq(round(min(tmp_VAL) - round(summary(y_MAX)[['Max.']]) / 10), y_MAX + round(summary(y_MAX)[['Max.']]) / 10, by = round((summary(tmp_VAL)['Max.'] - summary(tmp_VAL)['Min.']) / 5)), v = seq(1, length(vec_out) , by = 1),
-
-                         col = "gray", lty = 3)
-
+        graphics::abline(h = seq(round(min(tmp_VAL) - round(summary(y_MAX)[['Max.']]) / 10), y_MAX + round(summary(y_MAX)[['Max.']]) / 10, by = round((summary(tmp_VAL)['Max.'] - summary(tmp_VAL)['Min.']) / 5)), v = seq(1, length(vec_out) , by = 1), col = "gray", lty = 3)
         graphics::text(x = 1:length(vec_out), y = vec_out, labels = round(vec_out, 1), cex = 0.8, font = 2)
       }
 
       res = as.vector(gmm)
-
       return(res)
     }
   }
@@ -375,7 +379,13 @@ tryCatch_KMEANS_arma <- function(data, clusters, n_iter, verbose, seed_mode, CEN
 #'
 #' km = KMeans_arma(dat, clusters = 2, n_iter = 10, "random_subset")
 #'
-KMeans_arma = function(data, clusters, n_iter = 10, seed_mode = "random_subset", verbose = FALSE, CENTROIDS = NULL, seed = 1) {
+KMeans_arma = function(data,
+                       clusters,
+                       n_iter = 10,
+                       seed_mode = "random_subset",
+                       verbose = FALSE,
+                       CENTROIDS = NULL,
+                       seed = 1) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -390,7 +400,6 @@ KMeans_arma = function(data, clusters, n_iter = 10, seed_mode = "random_subset",
     stop('CENTROIDS should be a matrix with number of rows equal to the number of clusters and number of columns equal to the number of columns of the data')
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   res = tryCatch_KMEANS_arma(data, clusters, n_iter, verbose, seed_mode, CENTROIDS, seed)
@@ -456,9 +465,17 @@ KMeans_arma = function(data, clusters, n_iter = 10, seed_mode = "random_subset",
 #'
 #' km = KMeans_rcpp(dat, clusters = 2, num_init = 5, max_iters = 100, initializer = 'kmeans++')
 #'
-KMeans_rcpp = function(data, clusters, num_init = 1, max_iters = 100, initializer = 'kmeans++', fuzzy = FALSE,
-
-                       verbose = FALSE, CENTROIDS = NULL, tol = 1e-4, tol_optimal_init = 0.3, seed = 1) {
+KMeans_rcpp = function(data,
+                       clusters,
+                       num_init = 1,
+                       max_iters = 100,
+                       initializer = 'kmeans++',
+                       fuzzy = FALSE,
+                       verbose = FALSE,
+                       CENTROIDS = NULL,
+                       tol = 1e-4,
+                       tol_optimal_init = 0.3,
+                       seed = 1) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -474,7 +491,6 @@ KMeans_rcpp = function(data, clusters, num_init = 1, max_iters = 100, initialize
   if (tol_optimal_init <= 0.0) stop('tol_optimal_init should be a float number greater than 0.0')
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   res = KMEANS_rcpp(data, clusters, num_init, max_iters, initializer, fuzzy, verbose, CENTROIDS, tol, eps = 1.0e-6, tol_optimal_init, seed)
@@ -722,11 +738,19 @@ silhouette_of_clusters = function(data, clusters) {
 #'
-
-Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_explained", fK_threshold = 0.85, num_init = 1, max_iters = 200,
-
-                                   initializer = 'kmeans++', tol = 1e-4, plot_clusters = TRUE, verbose = FALSE, tol_optimal_init = 0.3,
-
-                                   seed = 1, mini_batch_params = NULL) {
+Optimal_Clusters_KMeans = function(data,
+                                   max_clusters,
+                                   criterion = "variance_explained",
+                                   fK_threshold = 0.85,
+                                   num_init = 1,
+                                   max_iters = 200,
+                                   initializer = 'kmeans++',
+                                   tol = 1e-4,
+                                   plot_clusters = TRUE,
+                                   verbose = FALSE,
+                                   tol_optimal_init = 0.3,
+                                   seed = 1,
+                                   mini_batch_params = NULL) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -758,7 +782,6 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
   if (length(max_clusters) != 1) plot_clusters = FALSE        # set "plot_clusters" to FALSE if the "max_clusters" parameter is not of length 1
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   LEN_CLUST = ITER_CLUST = NA
@@ -771,22 +794,18 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
   }
 
   vec_out = rep(NA, LEN_CLUST)
-
   if (verbose) { cat("", '\n'); pb = utils::txtProgressBar(min = 1, max = LEN_CLUST, style = 3); cat("", '\n') }
 
   COUNT = 1
 
   for (i in ITER_CLUST) {
 
     if (is.null(mini_batch_params)) {
-
       km = KMEANS_rcpp(data, i, num_init, max_iters, initializer, FALSE, FALSE, NULL, tol, 1.0e-6, tol_optimal_init, seed)
     }
     else {
-
-      km = MiniBatchKmeans(data, i, mini_batch_params[["batch_size"]], num_init, max_iters, mini_batch_params[["init_fraction"]], initializer,
-
-                           mini_batch_params[["early_stop_iter"]], FALSE, NULL, tol, tol_optimal_init, seed)
+      km = MiniBatchKmeans(data, i, mini_batch_params[["batch_size"]], num_init, max_iters, mini_batch_params[["init_fraction"]],
+                           initializer, mini_batch_params[["early_stop_iter"]], FALSE, NULL, tol, tol_optimal_init, seed)
 
       tmp_cent = km$centroids
       km["centroids"] = NULL
@@ -795,27 +814,21 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
       if (criterion %in% c("dissimilarity", "silhouette", "BIC")) {        # in these cases call also the 'predict_MBatchKMeans' function to receive the clusters
         km_preds = predict_MBatchKMeans(data, tmp_cent, FALSE)
-
         km[["clusters"]] = as.vector(km_preds)
       }
     }
 
     if (criterion == "variance_explained") {
-
       vec_out[COUNT] = sum(stats::na.omit(as.vector(km$WCSS_per_cluster))) / km$total_SSE
     }
 
     if (criterion == "WCSSE") {
-
       vec_out[COUNT] = sum(stats::na.omit(as.vector(km$WCSS_per_cluster)))
     }
 
     if (criterion == "dissimilarity") {
-
       eval_km = evaluation_rcpp(data, as.vector(km$clusters), FALSE)
-
       tmp_dis = mean(stats::na.omit(unlist(lapply(eval_km$INTRA_cluster_dissimilarity, mean))))
-
       vec_out[COUNT] = tmp_dis
     }
@@ -831,41 +844,29 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
     }
 
     if (criterion == "distortion_fK") {
-
       vec_out[COUNT] = sum(stats::na.omit(as.vector(km$WCSS_per_cluster)))
     }
 
     if (criterion == "AIC") {        # http://stackoverflow.com/questions/15839774/how-to-calculate-bic-for-k-means-clustering-in-r
-
       m = ncol(km$centers)
-
       k = nrow(km$centers)
-
       D = sum(stats::na.omit(km$WCSS_per_cluster))
-
       vec_out[COUNT] = D + 2.0 * m * k
     }
 
     if (criterion == "BIC") {        # http://stackoverflow.com/questions/15839774/how-to-calculate-bic-for-k-means-clustering-in-r
-
       m = ncol(km$centers)
-
       k = nrow(km$centers)
-
       n = length(km$clusters)
-
       D = sum(stats::na.omit(km$WCSS_per_cluster))
-
       vec_out[COUNT] = D + log(n) * m * k
     }
 
     if (criterion == 'Adjusted_Rsquared') {
-
       vec_out[COUNT] = sum(stats::na.omit(km$WCSS_per_cluster))
     }
 
     if (verbose) { utils::setTxtProgressBar(pb, COUNT) }
-
     COUNT = COUNT + 1
   }
@@ -884,96 +885,67 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
   if (criterion %in% c('variance_explained', 'WCSSE', 'dissimilarity', 'silhouette', 'AIC', 'BIC', 'Adjusted_Rsquared')) {
 
     if (plot_clusters) {
-
       tmp_VAL = as.vector(stats::na.omit(vec_out))
 
       if (length(which(is.na(vec_out))) > 0) {
-
         x_dis = (1:length(vec_out))[-which(is.na(vec_out))]
-
-        y_dis = vec_out[-which(is.na(vec_out))]}
-
+        y_dis = vec_out[-which(is.na(vec_out))]
+      }
       else {
-
         x_dis = 1:length(vec_out)
-
         y_dis = vec_out
       }
 
      y_MAX = max(tmp_VAL)
-
      graphics::plot(x = x_dis, y = y_dis, type = 'l', xlab = 'clusters', ylab = criterion, col = 'blue', lty = 3, axes = FALSE)
-
      graphics::axis(1, at = seq(1, length(vec_out) , by = 1))
 
      if (criterion == 'silhouette') {
-
        graphics::axis(2, at = seq(0, y_MAX + 0.05, by = 0.05 ), las = 1, cex.axis = 0.8)
-
-        graphics::abline(h = seq(0.0, max(as.vector(stats::na.omit(vec_out))), 0.05), v = seq(1, length(vec_out) , by = 1), col = "gray", lty = 3)}
-
+        graphics::abline(h = seq(0.0, max(as.vector(stats::na.omit(vec_out))), 0.05), v = seq(1, length(vec_out) , by = 1), col = "gray", lty = 3)
+      }
       else {
-
        tmp_summary = round(summary(y_MAX)[['Max.']])
-
        out_max_summary = ifelse(tmp_summary == 0, 1, tmp_summary)
-
        graphics::axis(2, at = seq(0, y_MAX + out_max_summary / 10, by = out_max_summary / 10), las = 1, cex.axis = 0.8)
-
        graphics::abline(h = seq(0.0, max(as.vector(stats::na.omit(vec_out))), out_max_summary / 10), v = seq(1, length(vec_out) , by = 1), col = "gray", lty = 3)
      }
 
      if (criterion %in% c("variance_explained", "Adjusted_Rsquared", "dissimilarity", "silhouette")) {
-
-        graphics::text(x = 1:length(vec_out), y = vec_out, labels = round(vec_out, 2), cex = 0.8, font = 2) }
-
+        graphics::text(x = 1:length(vec_out), y = vec_out, labels = round(vec_out, 2), cex = 0.8, font = 2)
+      }
      else {
-
        graphics::text(x = 1:length(vec_out), y = vec_out, labels = round(vec_out, 1), cex = 0.8, font = 2)
      }
    }
  }
  else {        # "distortion_fK"
-
    if (length(max_clusters) != 1) {
      fK_vec = "The 'distortion_fK' criterion can not be computed if the length of the 'max_clusters' parameter is greater than 1. See the details for more information!"
    }
    else {
      f_K = opt_clust_fK(vec_out, ncol(data), fK_threshold)
-
      fK_vec = as.vector(f_K$fK_evaluation)
    }
 
    if (plot_clusters) {
-
      if (length(which(is.na(fK_vec))) > 0) {
-
        x_fk = (1:length(fK_vec))[-which(is.na(fK_vec))]
-
-        y_fk = fK_vec[-which(is.na(fK_vec))]}
-
+        y_fk = fK_vec[-which(is.na(fK_vec))]
+      }
      else {
-
        x_fk = 1:length(fK_vec)
-
        y_fk = fK_vec
      }
 
      graphics::par(oma = c(0, 2, 0, 0))
-
      graphics::plot(y_fk, type = 'l', xlab = 'clusters', ylab = 'f(K)', col = 'green', axes = FALSE)
-
      graphics::axis(1, at = x_fk)
-
      graphics::axis(2, at = seq(0, max(y_fk) + 0.1, by = round(summary(y_fk)[['Max.']]) / 10), las = 1, cex.axis = 0.8)
-
      graphics::abline(h = seq(0.0, max(y_fk), round(summary(y_fk)[['Max.']]) / 10), v = seq(1, length(y_fk) , by = 1), col = "gray", lty = 3)
-
      graphics::abline(h = fK_threshold, col = 'blue', lty = 3)
-
      graphics::mtext("threshold", side = 2, line = 2, at = fK_threshold, las = 1, cex = 0.9)
-
      graphics::text(x = x_fk, y = y_fk, labels = round(y_fk,2), cex = 0.8, font = 2)
    }
  }
@@ -1033,9 +1005,19 @@ Optimal_Clusters_KMeans = function(data, max_clusters, criterion = "variance_exp
 #'
-MiniBatchKmeans = function(data, clusters, batch_size = 10, num_init = 1, max_iters = 100, init_fraction = 1.0, initializer = 'kmeans++',
-
-                           early_stop_iter = 10, verbose = FALSE, CENTROIDS = NULL, tol = 1e-4, tol_optimal_init = 0.3, seed = 1) {
+MiniBatchKmeans = function(data,
+                           clusters,
+                           batch_size = 10,
+                           num_init = 1,
+                           max_iters = 100,
+                           init_fraction = 1.0,
+                           initializer = 'kmeans++',
+                           early_stop_iter = 10,
+                           verbose = FALSE,
+                           CENTROIDS = NULL,
+                           tol = 1e-4,
+                           tol_optimal_init = 0.3,
+                           seed = 1) {
 
   if ('data.frame' %in% class(data)) data = as.matrix(data)
   if (!inherits(data, 'matrix')) stop('data should be either a matrix or a data frame')
@@ -1053,7 +1035,6 @@ MiniBatchKmeans = function(data, clusters, batch_size = 10, num_init = 1, max_it
   if (tol_optimal_init <= 0.0) stop('tol_optimal_init should be a float number greater than 0.0')
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   res = mini_batch_kmeans(data, clusters, batch_size, max_iters, num_init, init_fraction, initializer, early_stop_iter, verbose, CENTROIDS, tol, tol_optimal_init, seed)
@@ -1098,19 +1079,15 @@ predict_MBatchKMeans = function(data, CENTROIDS, fuzzy = FALSE) {
   if (!is.logical(fuzzy)) stop('fuzzy should be either TRUE or FALSE')
 
   flag_non_finite = check_NaN_Inf(data)
-
   if (!flag_non_finite) stop("the data includes NaN's or +/- Inf values")
 
   res = Predict_mini_batch_kmeans(data, CENTROIDS, fuzzy, eps = 1.0e-6)
 
   if (fuzzy) {
-
-    return(structure(list(clusters = as.vector(res$clusters + 1), fuzzy_clusters = res$fuzzy_clusters), class = "k-means clustering"))}
-
+    return(structure(list(clusters = as.vector(res$clusters + 1), fuzzy_clusters = res$fuzzy_clusters), class = "k-means clustering"))
+  }
   else {
-
     tmp_res = as.vector(res$clusters + 1)
-
     return(tmp_res)
   }
 }
@@ -2217,52 +2194,42 @@ external_validation = function(true_labels, clusters, method = "adjusted_rand_in
   # }
 
   if (method == "rand_index") {        # http://stats.stackexchange.com/questions/89030/rand-index-calculation
-
     return((tp + tn) / (tp + fp + fn + tn))
   }
 
   if (method == "adjusted_rand_index") {
-
     return((tp - prod_comb) / (mean_comb - prod_comb))        # https://github.com/scikit-learn/scikit-learn/blob/51a765a/sklearn/metrics/cluster/supervised.py#L90
   }
 
   if (method == "jaccard_index") {
-
     return(tp / (tp + fp + fn))        # http://www.cs.ucsb.edu/~veronika/MAE/wagner07comparingclusterings.pdf
   }
 
   if (method == "fowlkes_mallows_index") {
-
     return(sqrt((tp / ((tp + fp))) * (tp / (tp + fn))))        # https://en.wikipedia.org/wiki/Fowlkes%E2%80%93Mallows_index
   }
 
   if (method == "mirkin_metric") {
-
     return(2.0 * (fp + fn))        # http://www.cs.ucsb.edu/~veronika/MAE/wagner07comparingclusterings.pdf
   }
 
   if (method == 'purity') {        # http://bioinformatics.oxfordjournals.org/content/23/12/1495.full.pdf+html [ page 1498 ]
-
     return(res_purity)
   }
 
   if (method == 'entropy') {        # http://bioinformatics.oxfordjournals.org/content/23/12/1495.full.pdf+html [ page 1498 ]
-
     return(res_entropy)
   }
 
   if (method == 'nmi') {        # http://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html, http://stackoverflow.com/questions/35709562/how-to-calculate-clustering-entropy-a-working-example-or-software-code
-
     return(NMI)
   }
 
   if (method == 'var_info') {        # http://www.stat.washington.edu/mmp/Papers/compare-colt.pdf, http://www.cs.ucsb.edu/~veronika/MAE/wagner07comparingclusterings.pdf
-
     return(VAR_INFO)
   }
 
   if (method == 'nvi') {        # http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
-
     return(NVI)
   }
 }
diff --git a/inst/include/ClusterRHeader.h b/inst/include/ClusterRHeader.h
index 72acceb..efb8517 100644
--- a/inst/include/ClusterRHeader.h
+++ b/inst/include/ClusterRHeader.h
@@ -118,9 +118,7 @@ namespace clustR {
   void set_seed(int seed) {
 
     Rcpp::Environment base_env("package:base");
-
     Rcpp::Function set_seed_r = base_env["set.seed"];
-
     set_seed_r(seed);
   }
@@ -132,13 +130,10 @@ namespace clustR {
   arma::rowvec sample_vec(int num_elem, int start, int end, bool replace) {
 
     if (replace) {
-
-      return(arma::randi<arma::rowvec>( num_elem, arma::distr_param(start,end) ));}
-
+      return(arma::randi<arma::rowvec>( num_elem, arma::distr_param(start,end) ));
+    }
     else {
-
       arma::rowvec tmp = arma::conv_to< arma::rowvec >::from(arma::shuffle(arma::regspace(start,end)));
-
       return(tmp.subvec(0,num_elem-1));
     }
   }
@@ -148,7 +143,6 @@ namespace clustR {
   // early-stopping criterium
 
   double squared_norm(arma::mat x) {
-
     return std::sqrt(arma::accu(arma::pow(x, 2)));
   }
@@ -160,15 +154,12 @@ namespace clustR {
   int MinMat(arma::vec x) {
 
     double out = arma::datum::inf;
-
     int idx = 0;
 
     for (unsigned int i = 0; i < x.n_elem; i++) {
 
       if (x(i) < out) {
-
         out = x(i);
-
         idx = i;
       }
     }
@@ -187,7 +178,6 @@ namespace clustR {
     arma::vec tmp_c(centroids.n_rows);
 
     for (unsigned int i = 0; i < centroids.n_rows; i++) {
-
       tmp_c(i) = arma::as_scalar(arma::accu(arma::pow(vec - centroids.row(i), 2)));
     }
@@ -234,7 +224,6 @@ namespace clustR {
   //
 
   double kmeans_pp_dist(arma::rowvec vec, arma::rowvec centroid) {
-
     return arma::as_scalar(arma::accu(arma::pow(vec - centroid, 2)));
   }
@@ -397,11 +386,8 @@ namespace clustR {
   arma::rowvec norm_fuzzy(arma::rowvec vec, double eps) {
 
     vec = arma::abs(vec);
-
     vec += eps;
-
     arma::rowvec d = arma::accu(vec) / vec;
-
     return d / arma::accu(d);
   }
@@ -414,15 +400,11 @@ namespace clustR {
   Rcpp::NumericVector quantile_value(arma::rowvec x, int clusters) {
 
     arma::vec IDX = arma::regspace(0.0, arma::max(x) / (clusters - 1), 1.0);
-
     Rcpp::Environment stats("package:stats");
-
     Rcpp::Function quantile = stats["quantile"];
-
     Rcpp::NumericVector ans(IDX.n_elem);
 
     for(unsigned int i = 0; i < IDX.n_elem; i++){
-
       ans[i] = Rcpp::as<double>(quantile(x, IDX(i)));
     }
@@ -438,18 +420,15 @@ namespace clustR {
     std::map counts;
 
     for (unsigned int i = 0; i < x.n_elem; i++) {
-
       counts[x(i)]++;
     }
 
     arma::rowvec out_vec(counts.size());
-
     int iter = 0;
 
     for (auto& kv : counts) {
 
       out_vec(iter) = kv.second;
-
       iter++;
     }
@@ -465,48 +444,33 @@ namespace clustR {
   arma::uvec quantile_init_rcpp(arma::mat data, int sample_rows, int clusters) {
 
     arma::rowvec tmp_idx = sample_vec(sample_rows, 0, data.n_rows - 1, false);
-
     arma::rowvec vec(sample_rows, arma::fill::zeros);
 
     for (int i = 0; i < sample_rows; i++) {
 
      int tmp_val = floorf(std::abs(arma::accu(data.row(tmp_idx(0)) - data.row(tmp_idx(i)))) * 10000000.0) / 10000000;        // use floorf with a float number
-
      vec(i) = tmp_val;
    }
 
    Rcpp::NumericVector tmp_rcpp = Rcpp::as<Rcpp::NumericVector>(Rcpp::wrap(vec));
-
    arma::rowvec out = Rcpp::as<arma::rowvec>(Rcpp::duplicated(tmp_rcpp));
-
    arma::uvec tmp_out = arma::find(out == 0);
-
    arma::rowvec vec1 = arma::conv_to< arma::rowvec >::from(vec(tmp_out));
-
    arma::uvec vec_sort_idx = arma::sort_index(vec1);
-
    arma::rowvec vec_sorted = arma::conv_to< arma::rowvec >::from(vec1(vec_sort_idx));
-
    arma::rowvec idx1 = arma::conv_to< arma::rowvec >::from(tmp_idx(tmp_out));
-
    arma::rowvec idx1_sorted = arma::conv_to< arma::rowvec >::from(idx1(vec_sort_idx));
-
    arma::rowvec vec_cumsum = arma::cumsum(vec_sorted) / arma::accu(vec_sorted);
-
    arma::rowvec quant = Rcpp::as<arma::rowvec> (quantile_value(vec_cumsum, clusters));
-
    arma::uvec idx_out(clusters, arma::fill::zeros);        // medoids
 
    for (unsigned int j = 0; j < quant.n_elem; j++) {
 
      if (j == 0) {
-
-        idx_out(j) = idx1_sorted(0);}
-
+        idx_out(j) = idx1_sorted(0);
+      }
      else {
-
        arma::uvec tmp_find = arma::find(vec_cumsum <= quant[j]);
-
        idx_out(j) = idx1_sorted(tmp_find(tmp_find.n_elem - 1));
      }
    }
@@ -526,17 +490,11 @@ namespace clustR {
   arma::vec check_medoids(arma::mat data, int clust, double tol = 0.5) {
 
     arma::vec medoids_out(clust);        // medoids
-
     medoids_out.fill(arma::datum::nan);
-
     arma::mat medoids_dat(clust, data.n_cols);
-
     arma::uvec idx = arma::regspace<arma::uvec>(0, 1, data.n_rows - 1);
-
     arma::uvec shufl_idx = arma::shuffle(idx);
-
     arma::mat shufl_dat = data.rows(shufl_idx);
-
     int count_m = 0;
 
     for (unsigned int i = 0; i < shufl_dat.n_rows; i++) {
@@ -544,13 +502,10 @@ namespace clustR {
       if (i == 0) {
 
         medoids_dat.row(count_m) = arma::conv_to< arma::rowvec >::from(shufl_dat.row(i));
-
         medoids_out(count_m) = shufl_idx(i);
-
         count_m += 1;
 
         if (count_m == clust) {
-
           break;
         }
       }
@@ -558,14 +513,11 @@ namespace clustR {
       else {
 
         if (count_m == clust) {
-
           break;
         }
 
         unsigned int count_diff = 0;
-
         arma::mat sub_mat = medoids_dat.submat(0, 0, count_m - 1, medoids_dat.n_cols - 1);
-
         arma::rowvec pot_row = arma::conv_to< arma::rowvec >::from(shufl_dat.row(i));
 
         for (unsigned int j = 0; j < sub_mat.n_rows; j++) {
@@ -573,7 +525,6 @@ namespace clustR {
           double tmp_diff = arma::accu(arma::abs(arma::conv_to< arma::rowvec >::from(sub_mat.row(j)) - pot_row));
 
           if (tmp_diff > tol) {
-
             count_diff += 1;
           }
         }
@@ -581,9 +532,7 @@ namespace clustR {
         if (count_diff == sub_mat.n_rows) {
 
           medoids_dat.row(count_m) = pot_row;
-
           medoids_out(count_m) = shufl_idx(i);
-
           count_m += 1;
         }
       }
@@ -667,9 +616,7 @@ namespace clustR {
     for (unsigned int i = 0; i < x.n_cols; i++) {
 
       arma::vec tmp_vec = arma::conv_to< arma::vec >::from(x.col(i));
-
       double tmp_mean = arma::as_scalar(arma::mean(tmp_vec));
-
       tot_ss += arma::accu(arma::pow(tmp_vec - tmp_mean, 2));
     }
@@ -686,42 +633,44 @@ namespace clustR {
   // https://github.com/scikit-learn/scikit-learn/blob/51a765a/sklearn/cluster/k_means_.py#L660
   //
 
-  Rcpp::List KMEANS_rcpp(arma::mat& data, int clusters, int num_init = 1, int max_iters = 200, std::string initializer = "kmeans++", bool fuzzy = false, bool verbose = false,
-
-                         Rcpp::Nullable<Rcpp::NumericMatrix> CENTROIDS = R_NilValue, double tol = 1e-4, double eps = 1.0e-6, double tol_optimal_init = 0.5, int seed = 1) {
+  Rcpp::List KMEANS_rcpp(arma::mat& data,
+                         int clusters,
+                         int num_init = 1,
+                         int max_iters = 200,
+                         std::string initializer = "kmeans++",
+                         bool fuzzy = false,
+                         bool verbose = false,
+                         Rcpp::Nullable<Rcpp::NumericMatrix> CENTROIDS = R_NilValue,
+                         double tol = 1e-4,
+                         double eps = 1.0e-6,
+                         double tol_optimal_init = 0.5,
+                         int seed = 1) {
 
     int dat_n_rows = data.n_rows;
-
-    if (clusters > dat_n_rows - 2 || clusters < 1) { Rcpp::stop("the number of clusters should be at most equal to nrow(data) - 2 and never less than 1"); }
+    if (clusters > dat_n_rows - 2 || clusters < 1) {
+      Rcpp::stop("the number of clusters should be at most equal to nrow(data) - 2 and never less than 1");
+    }
 
     set_seed(seed);        // R's RNG
-
     bool flag = false;
-
     arma::mat CENTROIDS1;
 
     if (CENTROIDS.isNotNull()) {
-
      CENTROIDS1 = Rcpp::as<arma::mat>(CENTROIDS);
-
      num_init = 1;
-
      flag = true;
    }
 
    arma::rowvec lst_out;
-
    arma::mat centers_out, lst_fuzzy_out;
-
    arma::rowvec bst_obs(clusters, arma::fill::zeros);
-
    arma::rowvec bst_WCSS(clusters);
-
    bst_WCSS.fill(arma::datum::inf);        // initialize WCSS to Inf, so that in first iteration it can be compared with the minimum of the 'WSSE'
-
    int end_init = 0;
-
-    if (verbose) { Rcpp::Rcout << " " << std::endl; }
+    if (verbose) {
+      Rcpp::Rcout << " " << std::endl;
+    }
 
    arma::rowvec flag_exception(num_init, arma::fill::zeros);
@@ -732,54 +681,36 @@ namespace clustR {
      if (!flag) {
 
        if (initializer == "kmeans++") {
-
-          conv = kmeans_pp_init(data, clusters, false);}
-
+          conv = kmeans_pp_init(data, clusters, false);
+        }
 
        if (initializer == "random") {
-
          arma::uvec samp = arma::conv_to< arma::uvec >::from(sample_vec(clusters, 0, data.n_rows - 1, false));
-
          conv = data.rows(samp);
        }
-
        if (initializer == "quantile_init") {
-
          int samp_ROWS = data.n_rows / 10;
-
          arma::uvec tmp_conv = quantile_init_rcpp(data, samp_ROWS, clusters);
-
          flag_exception(init_count) = duplicated_flag(tmp_conv);        // print a warning in case of duplicated centroids
-
          conv = data.rows(tmp_conv);
        }
-
        if (initializer == "optimal_init") {
-
          arma::vec tmp_conv = check_medoids(data, clusters, tol_optimal_init);        // tolerance parameter 'tol' here by default equals to 0.5 [ this parameter is important in case of duplicated rows ]
-
          flag_exception(init_count) = !arma::is_finite(tmp_conv);        // necessary in order to stop the function in case of UBSAN memory errors
-
          arma::uvec idx_out = arma::conv_to< arma::uvec >::from(tmp_conv);
-
          conv = data.rows(idx_out);
        }
      }
-
      else {
-
        conv = CENTROIDS1;
      }
 
      arma::mat bst_centers, fuzzy_OUT;
-
      arma::rowvec CLUSTERS_OUT, OBS(clusters, arma::fill::zeros), SSE(clusters, arma::fill::zeros);
-
      int iter = 0;
 
      while(true) {
 
        arma::mat new_centroids(clusters, data.n_cols, arma::fill::zeros), soft_CLUSTERS(data.n_rows, clusters);
-
        arma::rowvec total_WSSE(clusters, arma::fill::zeros), num_obs(clusters, arma::fill::zeros), CLUSTERS(data.n_rows);
 
        for (unsigned int i = 0; i < data.n_rows; i++) {
@@ -787,49 +718,39 @@ namespace clustR {
          arma::vec tmp_vec = WCSS(arma::conv_to< arma::rowvec >::from(data.row(i)), conv);        // returns a rowvec with the WSSE for each cluster
 
          if (fuzzy) {
-
            soft_CLUSTERS.row(i) = arma::conv_to< arma::rowvec >::from(tmp_vec);
          }
 
          int tmp_idx = MinMat(tmp_vec);        // returns the index of the tmp_vec with the lowest WSSE
-
          arma::rowvec tmp_row = data.row(i);
-
          total_WSSE(tmp_idx) += tmp_vec(tmp_idx);        // assigns to total_WSSE the minimum cost
-
          num_obs(tmp_idx) += 1;        // number of observations in each cluster
-
          new_centroids.row(tmp_idx) += tmp_row;        // adds to the corresponding row of the tmp_clusters matrix the row of the data, so that the new centroids can be calculated
-
          CLUSTERS(i) = tmp_idx;
        }
 
        for (int j = 0; j < clusters; j++) {
-
          new_centroids.row(j) /= arma::as_scalar(num_obs(j));
        }
 
        double tmp_norm = squared_norm(conv - new_centroids);
-
        conv = new_centroids;
-
-        if (verbose) { Rcpp::Rcout << "iteration: " << iter + 1 << " --> total WCSS: " << arma::accu(total_WSSE) << " --> squared norm: " << tmp_norm << std::endl; }
+        if (verbose) {
+          Rcpp::Rcout << "iteration: " << iter + 1 << " --> total WCSS: " << arma::accu(total_WSSE) << " --> squared norm: " << tmp_norm << std::endl;
+        }
 
        if (tmp_norm < tol || iter == max_iters - 1) {        // break, if the squared_norm is less than tol or the iter equals to max_iters
 
          CLUSTERS_OUT = CLUSTERS;        // update clusters
 
          if (fuzzy) {
-
            fuzzy_OUT = soft_CLUSTERS;        // update soft clustering
          }
 
          SSE = total_WSSE;        // update WSSE
-
          OBS = num_obs;        // update number of observations
-
          bst_centers = conv;        // update centers
-
          break;
        }
@@ -837,23 +758,16 @@ namespace clustR {
      }
 
      double ACCUMUL_WCSS;
-
      ACCUMUL_WCSS = arma::as_scalar(arma::accu(bst_WCSS));
 
      if (arma::accu(SSE) < ACCUMUL_WCSS) {
-
        end_init = init_count + 1;
-
        bst_WCSS = SSE;
-
        bst_obs = OBS;
-
        centers_out = bst_centers;
-
        lst_out = CLUSTERS_OUT;
 
        if (fuzzy) {
-
          lst_fuzzy_out = fuzzy_OUT;
        }
      }
@@ -866,15 +780,12 @@ namespace clustR {
    if (exc > 0) {
 
      if (initializer == "quantile_init") {
-
        std::string message = "the centroid matrix using 'quantile_init' as initializer contains duplicates for number of clusters equal to : " + std::to_string(clusters);
-
-        Rcpp::warning(message);}
+        Rcpp::warning(message);
+      }
 
      if (initializer == "optimal_init") {
-
        std::string message = "The centroid matrix using 'optimal_init' as initializer contains NA's. Thus, the 'tol_optimal_init' parameter should be (probably) decreased for number of clusters equal to : " + std::to_string(clusters);
-
        Rcpp::stop(message);
      }
    }
@@ -886,7 +797,6 @@ namespace clustR {
      arma::mat fuzzy_mat(lst_fuzzy_out.n_rows, lst_fuzzy_out.n_cols);
 
      for (unsigned int i = 0; i < lst_fuzzy_out.n_rows; i++) {
-
        fuzzy_mat.row(i) = norm_fuzzy(arma::conv_to< arma::rowvec >::from(lst_fuzzy_out.row(i)), eps);
      }
@@ -898,12 +808,13 @@ namespace clustR {
                                Rcpp::Named("WCSS_per_cluster") = bst_WCSS, Rcpp::Named("obs_per_cluster") = bst_obs);
    }
-
    else {
-
-      return Rcpp::List::create(Rcpp::Named("clusters") = lst_out, Rcpp::Named("centers") = centers_out, Rcpp::Named("total_SSE") = tmp_sse,
-
-                                Rcpp::Named("best_initialization") = end_init, Rcpp::Named("WCSS_per_cluster") = bst_WCSS, Rcpp::Named("obs_per_cluster") = bst_obs);
+      return Rcpp::List::create(Rcpp::Named("clusters") = lst_out,
+                                Rcpp::Named("centers") = centers_out,
+                                Rcpp::Named("total_SSE") = tmp_sse,
+                                Rcpp::Named("best_initialization") = end_init,
+                                Rcpp::Named("WCSS_per_cluster") = bst_WCSS,
+                                Rcpp::Named("obs_per_cluster") = bst_obs);
    }
  }
diff --git a/man/GMM.Rd b/man/GMM.Rd
index e99dee1..06ffe57 100644
--- a/man/GMM.Rd
+++ b/man/GMM.Rd
@@ -59,7 +59,6 @@ dat = as.matrix(dietary_survey_IBS[, -ncol(dietary_survey_IBS)])
 dat = center_scale(dat)
 
 gmm = GMM(dat, 2, "maha_dist", "random_subset", 10, 10)
-
 }
 \references{
 http://arma.sourceforge.net/docs.html
diff --git a/tests/testthat/test-gmm.R b/tests/testthat/test-gmm.R
index b50003f..c0e5d75 100644
--- a/tests/testthat/test-gmm.R
+++ b/tests/testthat/test-gmm.R
@@ -542,10 +542,9 @@ testthat::test_that("in case that the verbose parameter is not logical, it retur
 })
 
 
-# testthat::test_that("in case that max_clusters is greater than the number of the columns and verbose is TRUE, it returns a warning", {
-#
-#   testthat::expect_warning( Optimal_Clusters_GMM(X, ncol(X) + 1, dist_mode = 'eucl_dist', criterion = "BIC", seed_mode = 'random_subset', km_iter = 5,
+# testthat::test_that("in case that max_clusters is greater than the number of rows and verbose is TRUE, it returns a warning", {
 #
+#   testthat::expect_warning( Optimal_Clusters_GMM(X, nrow(X) + 1, dist_mode = 'eucl_dist', criterion = "BIC", seed_mode = 'random_subset', km_iter = 5,
 #
 #                                                  em_iter = 5, plot_data = F, verbose = T) )
 # })
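Note on the behavioural change in this patch: in 1.3.0 the sanity check inside Optimal_Clusters_GMM() compared 'max_clusters' against ncol(data); as of 1.3.1 it is compared against nrow(data), and the corresponding warning is only emitted when verbose = TRUE (issue #45). The following minimal R sketch is not part of the patch; it assumes ClusterR >= 1.3.1 is installed and reuses the dietary_survey_IBS example data from the package documentation to show the corrected check.

# load the package and prepare the example data used in the docs
library(ClusterR)
dat = as.matrix(dietary_survey_IBS[, -ncol(dietary_survey_IBS)])
dat = center_scale(dat)

# normal use: max_clusters stays well below nrow(dat), so no warning is raised
opt_gmm = Optimal_Clusters_GMM(dat, max_clusters = 1:10, criterion = "AIC", plot_data = FALSE)

# as of 1.3.1 the warning refers to the number of rows and is raised only when
# verbose = TRUE and max_clusters exceeds nrow(dat), e.g. (kept commented here):
# Optimal_Clusters_GMM(dat, max_clusters = nrow(dat) + 1, criterion = "BIC",
#                      plot_data = FALSE, verbose = TRUE)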