rename 'inequity' -> 'inequality'; update docs

gagolews · Sep 22, 2023 · 42bd2bc · 42bd2bc
1 parent 1c08302
commit 42bd2bc
Show file tree

Hide file tree

Showing 73 changed files with 520 additions and 549 deletions.
diff --git a/.devel/pytest/test_disjoint_sets.py b/.devel/pytest/test_disjoint_sets.py
@@ -1,5 +1,5 @@
 import numpy as np
-from genieclust.inequity import *
+from genieclust.inequality import *
 from genieclust.internal import DisjointSets
 from genieclust.internal import GiniDisjointSets
 import time

diff --git a/.devel/pytest/test_inequity.py → .devel/pytest/test_inequality.py b/.devel/pytest/test_inequity.py → .devel/pytest/test_inequality.py
@@ -1,9 +1,9 @@
 import numpy as np
 import genieclust
-from genieclust.inequity import *
+from genieclust.inequality import *
 
 # np.random.seed(123)
-def test_inequity():
+def test_inequality():
     def gini_ref(x):
         n = len(x)
         s = 0.0
@@ -63,4 +63,4 @@ def devergottini_ref(x):
                 assert devergottini_index(x[::2], True) == devergottini_index(np.array(x[::2]))
 
 if __name__ == "__main__":
-    test_inequity()
+    test_inequality()
diff --git a/.devel/sandbox_parallel.py b/.devel/sandbox_parallel.py
@@ -1,6 +1,6 @@
 import numpy as np
 from genieclust.genie import *
-from genieclust.inequity import *
+from genieclust.inequality import *
 from genieclust.compare_partitions import *
 import time
 import gc, os

diff --git a/.devel/sphinx/genieclust.rst b/.devel/sphinx/genieclust.rst
@@ -8,7 +8,7 @@ Python Package `genieclust` Reference
     genieclust.GIc
     genieclust.cluster_validity
     genieclust.compare_partitions
-    genieclust.inequity
+    genieclust.inequality
     genieclust.internal
     genieclust.plots
     genieclust.tools
@@ -23,7 +23,7 @@ Python Package `genieclust` Reference
     genieclust_gic
     genieclust_cluster_validity
     genieclust_compare_partitions
-    genieclust_inequity
+    genieclust_inequality
     genieclust_internal
     genieclust_plots
     genieclust_tools
diff --git a/.devel/sphinx/genieclust_inequality.rst b/.devel/sphinx/genieclust_inequality.rst
@@ -0,0 +1,5 @@
+genieclust.inequality
+=====================
+
+.. automodule:: genieclust.inequality
+    :members:
diff --git a/.devel/sphinx/genieclust_inequity.rst b/.devel/sphinx/genieclust_inequity.rst
diff --git a/.devel/sphinx/news.md b/.devel/sphinx/news.md
@@ -3,6 +3,9 @@
 
 ## 1.1.4.9xxx
 
+* [BACKWARD INCOMPATIBILITY] [Python and R] Inequality measures
+    are no longer referred to as inequity measures.
+
 * [BACKWARD INCOMPATIBILITY] [Python and R]
     Some external cluster validity measures were renamed
     (as per the major revision of <https://doi.org/10.48550/arXiv.2209.02935>):
@@ -110,9 +113,7 @@
 *  [Python] `plots.plot_scatter` now uses a more accessible default palette
    (from R 4.0.0).
 
-*  [Python] New function: `inequity.devergottini_index`.
-
-*  [R] New function: `devergottini_index`.
+*  [Python and R] New function: `devergottini_index`.
 
 
 ## 1.0.0 (2021-04-22)

diff --git a/.devel/sphinx/rapi.md b/.devel/sphinx/rapi.md
@@ -9,6 +9,6 @@ rapi/compare_partitions
 rapi/emst_mlpack
 rapi/gclust
 rapi/genieclust-package
-rapi/inequity
+rapi/inequality
 rapi/mst
 ```
diff --git a/.devel/sphinx/rapi/compare_partitions.md b/.devel/sphinx/rapi/compare_partitions.md
@@ -4,7 +4,11 @@
 
 The functions described in this section quantify the similarity between two label vectors `x` and `y` which represent two partitions of a set of $n$ elements into, respectively, $K$ and $L$ nonempty and pairwise disjoint subsets.
 
-For instance, `x` and `y` can represent two clusterings of a dataset with $n$ observations specified by two vectors of labels. The functions described here can be used as external cluster validity measures, where we assume that `x` is a reference (ground-truth) partition.
+For instance, `x` and `y` can represent two clusterings of a dataset with $n$ observations specified by two vectors of labels. The functions described here can be used as external cluster validity measures, where we assume that `x` is a reference (ground-truth) partition whilst `y` is the vector of predicted cluster memberships.
+
+All indices except `normalized_clustering_accuracy()` can act as a pairwise partition similarity score: they are symmetric, i.e., `index(x, y) == index(y, x)`.
+
+Each index except `mi_score()` (which computes the mutual information score) outputs 1 given two identical partitions. Note that partitions are always defined up to a permutation (bijection) of the set of possible labels, e.g., (1, 1, 2, 1) and (4, 4, 2, 4) represent the same 2-partition.
 
 ## Usage
 
@@ -45,25 +49,19 @@ normalizing_permutation(x, y = NULL)
 
 ## Details
 
-Each index except `normalized_clustering_accuracy()` can act as a pairwise partition similarity score: it is symmetric, i.e., `index(x, y) == index(y, x)`.
-
-Each index except `mi_score()` (which computes the mutual information score) outputs 1 given two identical partitions. Note that partitions are always defined up to a permutation (bijection) of the set of possible labels, e.g., (1, 1, 2, 1) and (4, 4, 2, 4) represent the same 2-partition.
-
-`normalized_clustering_accuracy()` (Gagolewski, 2023) is an external cluster validity measure which assumes that the label vector `x` (or rows in the confusion matrix) represents the reference (ground truth) partition. It is an average proportion of correctly classified points in each cluster above the worst case scenario of uniform membership assignment, with cluster matching based on the solution to the maximal linear sum assignment problem; see [`normalized_confusion_matrix`](compare_partitions.md)). It is given by: $\max_\sigma \frac{1}{K} \sum_{i=1}^K \frac{c_{i, \sigma(i)}-c_{i,\cdot}/k}{c_{i,\cdot}-c_{i,\cdot}/k}$, where $C$ is a confusion matrix and $c_{i, \cdot}=c_{i, 1}+...+c_{i, K}$ is the i-th row sum. We assume that $K\ge L$.
-
-`normalized_pivoted_accuracy()` is defined as $(Accuracy(C_\sigma)-1/max(K,L))/(1-1/max(K,L))$, where $C_\sigma$ is a version of the confusion matrix for given `x` and `y` with columns permuted based on the solution to the maximal linear sum assignment problem. The $Accuracy(C_\sigma)$ part is sometimes referred to as set-matching classification rate or pivoted accuracy.
+`normalized_clustering_accuracy()` (Gagolewski, 2023) is an asymmetric external cluster validity measure which assumes that the label vector `x` (or rows in the confusion matrix) represents the reference (ground truth) partition. It is an average proportion of correctly classified points in each cluster above the worst case scenario of uniform membership assignment, with cluster ID matching based on the solution to the maximal linear sum assignment problem; see [`normalized_confusion_matrix`](compare_partitions.md). It is given by: $\max_\sigma \frac{1}{K} \sum_{j=1}^K \frac{c_{\sigma(j), j}-c_{\sigma(j),\cdot}/K}{c_{\sigma(j),\cdot}-c_{\sigma(j),\cdot}/K}$, where $C$ is a confusion matrix with $K$ rows and $L$ columns, $\sigma$ is a permutation of the set $\{1,\dots,\max(K,L)\}$, and $c_{i, \cdot}=c_{i, 1}+...+c_{i, L}$ is the i-th row sum, under the assumption that $c_{i,j}=0$ for $i>K$ or $j>L$ and $0/0=0$.
 
-`pair_sets_index()` gives the pair sets index (PSI) (Rezaei, Franti, 2016). Pairing is based on the solution to the linear sum assignment problem of a transformed version of the confusion matrix. For non-square matrices, missing rows/columns are assumed to be filled with 0s. The simplified PSI assumes E=1 in the definition of the index, i.e., uses Eq. (20) in the said paper instead of Eq. (18).
+`normalized_pivoted_accuracy()` is defined as $(\max_\sigma \sum_{j=1}^{\max(K,L)} c_{\sigma(j),j}/n-1/\max(K,L))/(1-1/\max(K,L))$, where $\sigma$ is a permutation of the set $\{1,\dots,\max(K,L)\}$, and $n$ is the sum of all elements in $C$. For non-square matrices, missing rows/columns are assumed to be filled with 0s.
 
-`rand_score()` gives the Rand score (the \"probability\" of agreement between the two partitions) and `adjusted_rand_score()` is its version corrected for chance, see (Hubert, Arabie, 1985), its expected value is 0.0 given two independent partitions. Due to the adjustment, the resulting index might also be negative for some inputs.
+`pair_sets_index()` (PSI) was introduced in (Rezaei, Franti, 2016). The simplified PSI assumes E=1 in the definition of the index, i.e., uses Eq. (20) in the said paper instead of Eq. (18). For non-square matrices, missing rows/columns are assumed to be filled with 0s.
 
-Similarly, `fm_score()` gives the Fowlkes-Mallows (FM) score and `adjusted_fm_score()` is its adjusted-for-chance version, see (Hubert, Arabie, 1985).
+`rand_score()` gives the Rand score (the \"probability\" of agreement between the two partitions) and `adjusted_rand_score()` is its version corrected for chance, see (Hubert, Arabie, 1985): its expected value is 0 given two independent partitions. Due to the adjustment, the resulting index may be negative for some inputs.
 
-Note that both the (unadjusted) Rand and FM scores are bounded from below by $1/(K+1)$ if $K=L$, hence their adjusted versions are preferred.
+Similarly, `fm_score()` gives the Fowlkes-Mallows (FM) score and `adjusted_fm_score()` is its adjusted-for-chance version; see (Hubert, Arabie, 1985).
 
 `mi_score()`, `adjusted_mi_score()` and `normalized_mi_score()` are information-theoretic scores, based on mutual information, see the definition of $AMI_{sum}$ and $NMI_{sum}$ in (Vinh et al., 2010).
 
-`normalized_confusion_matrix()` computes the confusion matrix and permutes its rows and columns so that the sum of the elements of the main diagonal is the largest possible (by solving the maximal assignment problem). The function only accepts $K \leq L$. The sole reordering of the columns of a confusion matrix can be determined by calling `normalizing_permutation()`.
+`normalized_confusion_matrix()` computes the confusion matrix and permutes its rows and columns so that the sum of the elements of the main diagonal is the largest possible (by solving the maximal assignment problem). The function only accepts $K \leq L$. The reordering of the columns of a confusion matrix can be determined by calling `normalizing_permutation()`.
 
 Also note that the built-in [`table()`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/table.html) determines the standard confusion matrix.
 

diff --git a/.devel/sphinx/rapi/gclust.md b/.devel/sphinx/rapi/gclust.md
@@ -2,7 +2,7 @@
 
 ## Description
 
-A reimplementation of *Genie* - a robust and outlier resistant clustering algorithm (see Gagolewski, Bartoszuk, Cena, 2016). The Genie algorithm is based on a minimum spanning tree (MST) of the pairwise distance graph of a given point set. Just like the single linkage, it consumes the edges of the MST in an increasing order of weights. However, it prevents the formation of clusters of highly imbalanced sizes; once the Gini index (see [`gini_index()`](inequity.md)) of the cluster size distribution raises above `gini_threshold`, a forced merge of a point group of the smallest size is performed. Its appealing simplicity goes hand in hand with its usability; Genie often outperforms other clustering approaches on benchmark data, such as <https://github.com/gagolews/clustering-benchmarks>.
+A reimplementation of *Genie* - a robust and outlier resistant clustering algorithm (see Gagolewski, Bartoszuk, Cena, 2016). The Genie algorithm is based on a minimum spanning tree (MST) of the pairwise distance graph of a given point set. Just like the single linkage, it consumes the edges of the MST in an increasing order of weights. However, it prevents the formation of clusters of highly imbalanced sizes; once the Gini index (see [`gini_index()`](inequality.md)) of the cluster size distribution rises above `gini_threshold`, a forced merge of a point group of the smallest size is performed. Its appealing simplicity goes hand in hand with its usability; Genie often outperforms other clustering approaches on benchmark data, such as <https://github.com/gagolews/clustering-benchmarks>.
 
 The clustering can now also be computed with respect to the mutual reachability distance (based, e.g., on the Euclidean metric), which is used in the definition of the HDBSCAN\* algorithm (see Campello et al., 2013). If `M` \> 1, then the mutual reachability distance $m(i,j)$ with smoothing factor `M` is used instead of the chosen \"raw\" distance $d(i,j)$. It holds $m(i,j)=\max(d(i,j), c(i), c(j))$, where $c(i)$ is $d(i,k)$ with $k$ being the (`M`-1)-th nearest neighbour of $i$. This makes \"noise\" and \"boundary\" points being \"pulled away\" from each other.
 

diff --git a/.devel/sphinx/rapi/inequity.md → .devel/sphinx/rapi/inequality.md b/.devel/sphinx/rapi/inequity.md → .devel/sphinx/rapi/inequality.md
@@ -1,4 +1,4 @@
-# inequity: Inequity (Inequality) Measures
+# inequality: Inequality Measures
 
 ## Description
 
@@ -22,7 +22,7 @@ devergottini_index(x)
 
 ## Details
 
-These indices can be used to quantify the \"inequity\" of a numeric sample. They can be perceived as measures of data dispersion. For constant vectors (perfect equity), the indices yield values of 0. Vectors with all elements but one equal to 0 (perfect inequity), are assigned scores of 1. They follow the Pigou-Dalton principle (are Schur-convex): setting $x_i = x_i - h$ and $x_j = x_j + h$ with $h > 0$ and $x_i - h \geq x_j + h$ (taking from the \"rich\" and giving to the \"poor\") decreases the inequity.
+These indices can be used to quantify the \"inequality\" of a numeric sample. They can be conceived as normalised measures of data dispersion. For constant vectors (perfect equity), the indices yield values of 0. Vectors with all elements but one equal to 0 (perfect inequality) are assigned scores of 1. They follow the Pigou-Dalton principle (are Schur-convex): setting $x_i = x_i - h$ and $x_j = x_j + h$ with $h > 0$ and $x_i - h \geq x_j + h$ (taking from the \"rich\" and giving to the \"poor\") decreases the inequality.
 
 These indices have applications in economics, amongst others. The Genie clustering algorithm uses the Gini index as a measure of the inequality of cluster sizes.
 
@@ -60,7 +60,7 @@ Time complexity: $O(n)$ for sorted (increasingly) data. Otherwise, the vector wi
 
 ## Value
 
-The value of the inequity index, a number in $[0, 1]$.
+The value of the inequality index, a number in $[0, 1]$.
 
 ## Author(s)
 

diff --git a/.devel/sphinx/weave/basics.rst b/.devel/sphinx/weave/basics.rst
@@ -4,7 +4,7 @@ Basics
 
 *Genie* :cite:`genieins` is an agglomerative hierarchical clustering
 algorithm that links clusters minding that
-the Gini index (a popular measure of inequity used in, amongst others,
+the Gini index (a popular measure of inequality used in, amongst others,
 economics) of the cluster sizes should not go too far beyond a given threshold.
 If this happens, instead of merging two closest clusters, a smallest cluster
 is joined with its nearest neighbour.

diff --git a/.devel/sphinx/weave/basics.rstw b/.devel/sphinx/weave/basics.rstw
@@ -4,7 +4,7 @@ Basics
 
 *Genie* :cite:`genieins` is an agglomerative hierarchical clustering
 algorithm that links clusters minding that
-the Gini index (a popular measure of inequity used in, amongst others,
+the Gini index (a popular measure of inequality used in, amongst others,
 economics) of the cluster sizes should not go too far beyond a given threshold.
 If this happens, instead of merging two closest clusters, a smallest cluster
 is joined with its nearest neighbour.

diff --git a/.devel/sphinx/weave/figures/basics_basics-plot-km_1.png b/.devel/sphinx/weave/figures/basics_basics-plot-km_1.png
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,7 @@ Description: A retake on the Genie algorithm
     'HDBSCAN*' (that is able to detect a predefined number of
     clusters and hence it does not depend on the somewhat
     fragile 'eps' parameter).
-    The package also features an implementation of economic inequity indices
+    The package also features an implementation of inequality indices
     (the Gini, Bonferroni index), external cluster validity measures
     (e.g., the normalised clustering accuracy and partition similarity scores
     such as the adjusted Rand, Fowlkes-Mallows, adjusted mutual information,

diff --git a/MANIFEST b/MANIFEST
@@ -13,14 +13,14 @@ genieclust/c_cvi.pxd
 genieclust/c_disjoint_sets.pxd
 genieclust/c_genie.pxd
 genieclust/c_gini_disjoint_sets.pxd
-genieclust/c_inequity.pxd
+genieclust/c_inequality.pxd
 genieclust/c_mst.pxd
 genieclust/c_postprocess.pxd
 genieclust/c_preprocess.pxd
 genieclust/cluster_validity.pyx
 genieclust/compare_partitions.pyx
 genieclust/genie.py
-genieclust/inequity.pyx
+genieclust/inequality.pyx
 genieclust/internal.pyx
 genieclust/plots.py
 genieclust/tools.pyx
@@ -32,7 +32,7 @@ src/c_disjoint_sets.h
 src/c_distance.h
 src/c_genie.h
 src/c_gini_disjoint_sets.h
-src/c_inequity.h
+src/c_inequality.h
 src/c_int_dict.h
 src/c_matrix.h
 src/c_mst.h

diff --git a/NEWS b/NEWS
@@ -3,6 +3,9 @@
 
 ## 1.1.4.9xxx
 
+* [BACKWARD INCOMPATIBILITY] [Python and R] Inequality measures
+    are no longer referred to as inequity measures.
+
 * [BACKWARD INCOMPATIBILITY] [Python and R]
     Some external cluster validity measures were renamed
     (as per the major revision of <https://doi.org/10.48550/arXiv.2209.02935>):
@@ -110,9 +113,7 @@
 *  [Python] `plots.plot_scatter` now uses a more accessible default palette
    (from R 4.0.0).
 
-*  [Python] New function: `inequity.devergottini_index`.
-
-*  [R] New function: `devergottini_index`.
+*  [Python and R] New function: `devergottini_index`.
 
 
 ## 1.0.0 (2021-04-22)