more

gagolews · Jul 25, 2024 · 28ba6bd · 28ba6bd
1 parent 53c554d
commit 28ba6bd
Show file tree

Hide file tree

Showing 88 changed files with 2,632 additions and 980 deletions.
diff --git a/.devel/sphinx/bibliography.bib b/.devel/sphinx/bibliography.bib
@@ -1,21 +1,21 @@
-@misc{clustering-msts,
+@article{cvimst,
     author = {M. Gagolewski and A. Cena and M. Bartoszuk and L. Brzozowski},
     title = {Clustering with minimum spanning trees: {H}ow good can it be?},
-    year = {2023},
-    publisher = {arXiv},
-    doi = {10.48550/arXiv.2303.05679},
-    url = {https://arxiv.org/pdf/2303.05679.pdf},
-    note = {under review (preprint)}
+    journal = {Journal of Classification},
+    year = {2024},
+    url = {https://link.springer.com/content/pdf/10.1007/s00357-024-09483-1.pdf},
+    doi = {10.1007/s00357-024-09483-1},
+    note = {in press}
 }
 
-@misc{nca,
+@article{nca,
     author = {M. Gagolewski},
     title = {Normalised clustering accuracy: {A}n asymmetric external cluster validity measure},
-    year = {2023},
-    publisher = {arXiv},
-    doi = {10.48550/arXiv.2209.02935},
-    url = {https://arxiv.org/pdf/2209.02935.pdf},
-    note = {under review (preprint)}
+    journal = {Journal of Classification},
+    year = {2024},
+    url = {https://link.springer.com/content/pdf/10.1007/s00357-024-09482-2.pdf},
+    doi = {10.1007/s00357-024-09482-2},
+    note = {in press}
 }
 
 @misc{Gagolewski2022:clustering-data-v1.1.0,
@@ -45,14 +45,14 @@ @book{datawranglingpy
     publisher = {Zenodo},
     address = {Melbourne},
     url = {https://datawranglingpy.gagolewski.com/},
-    year = {2022}
+    year = {2024}
 }
 
 @book{deepr,
     author = {M. Gagolewski},
     title = {Deep {R} Programming},
     url = {https://deepr.gagolewski.com/},
-    year = {2023},
+    year = {2024},
     doi = {10.5281/zenodo.7490464},
     isbn = {978-0-6455719-2-9},
     publisher = {Zenodo},
@@ -98,6 +98,7 @@ @article{cvi
     year = {2021},
     pages = {620--636},
     volume = {581},
+    doi = {10.1016/j.ins.2021.10.004},
     url = {https://arxiv.org/pdf/2208.01261}
 }
 

diff --git a/.devel/sphinx/index.md b/.devel/sphinx/index.md
@@ -28,7 +28,7 @@ Of course, there is no, nor will there ever be, a single best
 universal clustering approach for every kind of problem, but Genie
 is definitely worth a try!
 
-Thanks to its being based on minimal spanning trees {cite}`clustering-msts`
+Thanks to its being based on minimal spanning trees {cite}`cvimst`
 of the pairwise distance graphs, Genie is also **very fast** — determining
 the whole cluster hierarchy for datasets of millions of points can be
 completed within {any}`minutes <weave/timings>`.
@@ -115,7 +115,7 @@ The implemented algorithms include:
 
 -   *GIc* (*Genie+Information Criterion*) –
     a heuristic agglomerative algorithm {cite}`cenaphd` to minimise the information
-    theoretic criterion {cite}`itm`; see {cite}`clustering-msts`
+    theoretic criterion {cite}`itm`; see {cite}`cvimst`
     *(Python only)*.
 
 Other features:

diff --git a/.devel/sphinx/news.md b/.devel/sphinx/news.md
@@ -11,8 +11,7 @@
     are no longer referred to as inequity measures.
 
 * [BACKWARD INCOMPATIBILITY] [Python and R]
-    Some external cluster validity measures were renamed
-    (as per the major revision of <https://doi.org/10.48550/arXiv.2209.02935>):
+    Some external cluster validity measures were renamed:
     `adjusted_asymmetric_accuracy` -> `normalized_clustering_accuracy`,
     `normalized_accuracy` -> `normalized_pivoted_accuracy`.
 
@@ -38,8 +37,8 @@
 ## 1.1.4 (2023-03-31)
 
 *  [Python] The GIc algorithm is no longer marked as experimental;
-   its description will be provided in a forthcoming paper; see
-   <https://doi.org/10.48550/arXiv.2303.05679>.
+   its description is provided in
+   <https://doi.org/10.1007/s00357-024-09483-1>.
 
 
 ## 1.1.3 (2023-01-17)

diff --git a/.devel/sphinx/rapi/cluster_validity.md b/.devel/sphinx/rapi/cluster_validity.md
@@ -36,14 +36,14 @@ wcnn_index(X, y, M = 25L)
 
 ## Arguments
 
-|                                    |                                                                                                                                                                                                                                                                         |
-|------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `X`                                | numeric matrix with `n` rows and `d` columns, representing `n` points in a `d`-dimensional space                                                                                                                                                                        |
-| `y`                                | vector of `n` integer labels, representing a partition whose *quality* is to be assessed; `y[i]` is the cluster ID of the `i`-th point, `X[i, ]`; `1 <= y[i] <= K`, where `K` is the number or clusters                                                                 |
-| `M`                                | number of nearest neighbours                                                                                                                                                                                                                                            |
-| `owa_numerator`, `owa_denominator` | single string specifying the OWA operators to use in the definition of the DuNN index; one of: `"Mean"`, `"Min"`, `"Max"`, `"Const"`, `"SMin:D"`, `"SMax:D"`, where `D` is an integer defining the degree of smoothness                                                 |
-| `lowercase_d`                      | an integer between 1 and 5, denoting $d_1$, \..., $d_5$ in the definition of the generalised Dunn (Bezdek-Pal) index (numerator: min, max, and mean pairwise intracluster distance, distance between cluster centroids, weighted point-centroid distance, respectively) |
-| `uppercase_d`                      | an integer between 1 and 3, denoting $D_1$, \..., $D_3$ in the definition of the generalised Dunn (Bezdek-Pal) index (denominator: max and min pairwise intracluster distance, average point-centroid distance, respectively)                                           |
+|  |  |
+|----|----|
+| `X` | numeric matrix with `n` rows and `d` columns, representing `n` points in a `d`-dimensional space |
+| `y` | vector of `n` integer labels, representing a partition whose *quality* is to be assessed; `y[i]` is the cluster ID of the `i`-th point, `X[i, ]`; `1 <= y[i] <= K`, where `K` is the number of clusters |
+| `M` | number of nearest neighbours |
+| `owa_numerator`, `owa_denominator` | single string specifying the OWA operators to use in the definition of the DuNN index; one of: `"Mean"`, `"Min"`, `"Max"`, `"Const"`, `"SMin:D"`, `"SMax:D"`, where `D` is an integer defining the degree of smoothness |
+| `lowercase_d` | an integer between 1 and 5, denoting $d_1$, \..., $d_5$ in the definition of the generalised Dunn (Bezdek-Pal) index (numerator: min, max, and mean pairwise intracluster distance, distance between cluster centroids, weighted point-centroid distance, respectively) |
+| `uppercase_d` | an integer between 1 and 3, denoting $D_1$, \..., $D_3$ in the definition of the generalised Dunn (Bezdek-Pal) index (denominator: max and min pairwise intracluster distance, average point-centroid distance, respectively) |
 
 ## Value
 
@@ -67,7 +67,7 @@ Dunn J.C., A Fuzzy Relative of the ISODATA Process and Its Use in Detecting Comp
 
 Gagolewski M., Bartoszuk M., Cena A., Are cluster validity measures (in)valid?, *Information Sciences* 581, 620-636, 2021, [doi:10.1016/j.ins.2021.10.004](https://doi.org/10.1016/j.ins.2021.10.004); preprint: <https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021cvi.pdf>.
 
-Gagolewski M., *A Framework for Benchmarking Clustering Algorithms*, 2022, <https://clustering-benchmarks.gagolewski.com>.
+Gagolewski M., A Framework for Benchmarking Clustering Algorithms, *SoftwareX* 20, 2022, 101270, [doi:10.1016/j.softx.2022.101270](https://doi.org/10.1016/j.softx.2022.101270), <https://clustering-benchmarks.gagolewski.com>.
 
 Rousseeuw P.J., Silhouettes: A Graphical Aid to the Interpretation and Validation of Cluster Analysis, *Computational and Applied Mathematics* 20, 1987, 53-65, [doi:10.1016/0377-0427(87)90125-7](https://doi.org/10.1016/0377-0427%2887%2990125-7).
 
@@ -82,7 +82,7 @@ Gagolewski M., <span class="pkg">genieclust</span>: Fast and robust hierarchical
 
 
 
-```r
+``` r
 X <- as.matrix(iris[,1:4])
 X[,] <- jitter(X)  # otherwise we get a non-unique solution
 y <- as.integer(iris[[5]])
@@ -93,7 +93,7 @@ calinski_harabasz_index(X, y)  # good
 ## [1] 486.6681
 ```
 
-```r
+``` r
 calinski_harabasz_index(X, sample(1:3, nrow(X), replace=TRUE))  # bad
 ```
 

diff --git a/.devel/sphinx/rapi/compare_partitions.md b/.devel/sphinx/rapi/compare_partitions.md
@@ -40,12 +40,12 @@ normalizing_permutation(x, y = NULL)
 
 ## Arguments
 
-|              |                                                                                                                                                                                                                                                                           |
-|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `x`          | an integer vector of length n (or an object coercible to) representing a K-partition of an n-set (e.g., a reference partition), or a confusion matrix with K rows and L columns (see [`table(x, y)`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/table.html)) |
-| `y`          | an integer vector of length n (or an object coercible to) representing an L-partition of the same set (e.g., the output of a clustering algorithm we wish to compare with `x`), or NULL (if x is an K\*L confusion matrix)                                                |
-| `simplified` | whether to assume E=1 in the definition of the pair sets index index, i.e., use Eq. (20) in (Rezaei, Franti, 2016) instead of Eq. (18)                                                                                                                                    |
-| `clipped`    | whether the result should be clipped to the unit interval, i.e., \[0, 1\]                                                                                                                                                                                                 |
+|  |  |
+|----|----|
+| `x` | an integer vector of length n (or an object coercible to one) representing a K-partition of an n-set (e.g., a reference partition), or a confusion matrix with K rows and L columns (see [`table(x, y)`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/table.html)) |
+| `y` | an integer vector of length n (or an object coercible to one) representing an L-partition of the same set (e.g., the output of a clustering algorithm we wish to compare with `x`), or NULL (if x is a K\*L confusion matrix) |
+| `simplified` | whether to assume E=1 in the definition of the pair sets index, i.e., use Eq. (20) in (Rezaei, Franti, 2016) instead of Eq. (18) |
+| `clipped` | whether the result should be clipped to the unit interval, i.e., \[0, 1\] |
 
 ## Details
 
@@ -79,9 +79,9 @@ Each cluster validity measure is a single numeric value.
 
 ## References
 
-Gagolewski M., *A Framework for Benchmarking Clustering Algorithms*, 2022, <https://clustering-benchmarks.gagolewski.com>.
+Gagolewski M., A framework for benchmarking clustering algorithms, *SoftwareX* 20, 2022, 101270, [doi:10.1016/j.softx.2022.101270](https://doi.org/10.1016/j.softx.2022.101270), <https://clustering-benchmarks.gagolewski.com>.
 
-Gagolewski M., Normalised clustering accuracy: An asymmetric external cluster validity measure, 2023, under review (preprint), [doi:10.48550/arXiv.2209.02935](https://doi.org/10.48550/arXiv.2209.02935).
+Gagolewski M., Normalised clustering accuracy: An asymmetric external cluster validity measure, *Journal of Classification*, 2024, in press, [doi:10.1007/s00357-024-09482-2](https://doi.org/10.1007/s00357-024-09482-2).
 
 Hubert L., Arabie P., Comparing partitions, *Journal of Classification* 2(1), 1985, 193-218, esp. Eqs. (2) and (4).
 
@@ -104,7 +104,7 @@ Gagolewski M., <span class="pkg">genieclust</span>: Fast and robust hierarchical
 
 
 
-```r
+``` r
 y_true <- iris[[5]]
 y_pred <- kmeans(as.matrix(iris[1:4]), 3)$cluster
 normalized_clustering_accuracy(y_true, y_pred)
@@ -114,87 +114,87 @@ normalized_clustering_accuracy(y_true, y_pred)
 ## [1] 0.84
 ```
 
-```r
+``` r
 normalized_pivoted_accuracy(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.84
 ```
 
-```r
+``` r
 pair_sets_index(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.7568238
 ```
 
-```r
+``` r
 pair_sets_index(y_true, y_pred, simplified=TRUE)
 ```
 
 ```
 ## [1] 0.7470968
 ```
 
-```r
+``` r
 adjusted_rand_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.7302383
 ```
 
-```r
+``` r
 rand_score(table(y_true, y_pred)) # the same
 ```
 
 ```
 ## [1] 0.8797315
 ```
 
-```r
+``` r
 adjusted_fm_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.7304411
 ```
 
-```r
+``` r
 fm_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.8208081
 ```
 
-```r
+``` r
 mi_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.8255911
 ```
 
-```r
+``` r
 normalized_mi_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.7581757
 ```
 
-```r
+``` r
 adjusted_mi_score(y_true, y_pred)
 ```
 
 ```
 ## [1] 0.7551192
 ```
 
-```r
+``` r
 normalized_confusion_matrix(y_true, y_pred)
 ```
 
@@ -205,7 +205,7 @@ normalized_confusion_matrix(y_true, y_pred)
 ## [3,]    0   14   36
 ```
 
-```r
+``` r
 normalizing_permutation(y_true, y_pred)
 ```
 

diff --git a/.devel/sphinx/rapi/emst_mlpack.md b/.devel/sphinx/rapi/emst_mlpack.md
@@ -12,12 +12,12 @@ emst_mlpack(X, leaf_size = 1, naive = FALSE, verbose = FALSE)
 
 ## Arguments
 
-|             |                                                                                                |
-|-------------|------------------------------------------------------------------------------------------------|
-| `X`         | a numeric matrix (or an object coercible to one, e.g., a data frame with numeric-like columns) |
-| `leaf_size` | size of leaves in the kd-tree, controls the trade-off between speed and memory consumption     |
-| `naive`     | logical; whether to use the naive, quadratic-time algorithm                                    |
-| `verbose`   | logical; whether to print diagnostic messages                                                  |
+|  |  |
+|----|----|
+| `X` | a numeric matrix (or an object coercible to one, e.g., a data frame with numeric-like columns) |
+| `leaf_size` | size of leaves in the kd-tree, controls the trade-off between speed and memory consumption |
+| `naive` | logical; whether to use the naive, quadratic-time algorithm |
+| `verbose` | logical; whether to print diagnostic messages |
 
 ## Value
 

diff --git a/.devel/sphinx/rapi/figure/gclust-1.png b/.devel/sphinx/rapi/figure/gclust-1.png