Added helper functions and fixed mrd memory leak.

mhahsler · Jan 14, 2022 · d347e13 · d347e13
1 parent 16cda5b
commit d347e13
Show file tree

Hide file tree

Showing 34 changed files with 907 additions and 434 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,21 +1,22 @@
 Package: dbscan
-Version: 1.1-9
-Date: 2022-01-10
-Title: Density Based Clustering of Applications with Noise (DBSCAN) and Related
-    Algorithms
+Version: 1.1-10
+Date: 2022-01-14
+Title: Density-Based Spatial Clustering of Applications with Noise (DBSCAN) 
+    and Related Algorithms
 Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"),
                 email = "[email protected]"),
 	    person("Matthew", "Piekenbrock", role = c("aut", "cph")),
 	    person("Sunil", "Arya", role = c("ctb", "cph")),
 	    person("David", "Mount", role = c("ctb", "cph")))
 Description: A fast reimplementation of several density-based algorithms of
-    the DBSCAN family for spatial data. Includes the clustering algorithms 
-    DBSCAN (density-based spatial clustering of applications with noise)
-    and HDBSCAN (hierarchical DBSCAN), the ordering algorithm
-    OPTICS (ordering points to identify the clustering structure), 
-    and the outlier detection algorithm LOF (local outlier factor). 
-    The implementations use the kd-tree data structure (from library ANN) for faster k-nearest neighbor search. 
-    An R interface to fast kNN and fixed-radius NN search is also provided. 
+    the DBSCAN family. Includes the clustering algorithms DBSCAN (density-based 
+    spatial clustering of applications with noise) and HDBSCAN (hierarchical 
+    DBSCAN), the ordering algorithm OPTICS (ordering points to identify the 
+    clustering structure), shared nearest neighbor clustering, and the outlier 
+    detection algorithms LOF (local outlier factor) and GLOSH (global-local 
+    outlier score from hierarchies). The implementations use the kd-tree data 
+    structure (from library ANN) for faster k-nearest neighbor search. An R 
+    interface to fast kNN and fixed-radius NN search is also provided. 
     Hahsler, Piekenbrock and Doran (2019) <doi:10.18637/jss.v091.i01>.
 SystemRequirements: C++11
 Imports:

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,31 +1,45 @@
 useDynLib("dbscan", .registration = TRUE)
 import("Rcpp")
 importFrom("graphics", "plot", "points", "par", "segments",
-  "lines", "polygon", "text")
+  "lines", "polygon", "text", "abline")
 importFrom("grDevices", "palette", "chull", "adjustcolor")
 importFrom("stats", "dist", "hclust", "dendrapply", "as.dendrogram",
   "is.leaf", "prcomp")
+importFrom("utils", "tail")
+
 
 export(
-  adjacencylist,
   kNN,
   kNNdistplot,
   kNNdist,
   frNN,
   sNN,
+  adjacencylist,
+
   dbscan,
+  is.corepoint,
+
   optics,
   extractDBSCAN,
   extractXi,
+
   hdbscan,
   extractFOSC,
+  as.reachability,
+  coredist,
+  mrdist,
+
   sNNclust,
+
   jpclust,
+
   lof,
   glosh,
   pointdensity,
+
   hullplot,
-  as.reachability
+
+  comps
 )
 
 S3method(print, dbscan_fast)
@@ -59,3 +73,8 @@ S3method(sort, sNN)
 S3method(print, sNN)
 
 S3method(plot, NN)
+
+S3method(comps, dist)
+S3method(comps, kNN)
+S3method(comps, sNN)
+S3method(comps, frNN)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,16 @@
+# dbscan 1.1-10 (2022-01-14)
+
+## New Features
+* is.corepoint() for DBSCAN.
+* coredist() and mrdist() for HDBSCAN.
+* find connected components with comps().
+
+## Changes
+* reachability plot now shows all undefined distances as a dashed line.
+
+## Bugfix
+* memory leak in mrd calculation fixed.
+
 # dbscan 1.1-9 (2022-01-10)
 
 ## Changes

diff --git a/R/NN.R b/R/NN.R
@@ -28,7 +28,10 @@
 #' @family NN functions
 #'
 #' @param x a `NN` object
-#' @param ... further parameters
+#' @param pch plotting character.
+#' @param col color used for the data points (nodes).
+#' @param linecol color used for edges.
+#' @param ... further parameters passed on to [plot()].
 #' @param decreasing sort in decreasing order?
 #' @param data that was used to create `x`
 #' @param main title
@@ -81,7 +84,7 @@ sort.NN <- function(x, decreasing = FALSE, ...) {
 
 
 #' @rdname NN
-plot.NN <- function(x, data, main = NULL, ...) {
+plot.NN <- function(x, data, main = NULL, pch = 16, col = NULL, linecol = "gray", ...) {
   if (is.null(main)) {
     if (inherits(x, "frNN"))
       main <- paste0("frNN graph (eps = ", x$eps, ")")
@@ -91,7 +94,9 @@ plot.NN <- function(x, data, main = NULL, ...) {
       main <- paste0("Shared NN graph (k=", x$k,
         ifelse(is.null(x$kt), "", paste0(", kt=", x$kt)), ")")
   }
-  plot(data[, 1:2], main = main, ...)
+
+  ## create an empty plot
+  plot(data[, 1:2], main = main, type = "n", pch = pch, col = col, ...)
 
   id <- adjacencylist(x)
 
@@ -101,13 +106,19 @@ plot.NN <- function(x, data, main = NULL, ...) {
     for (i in 1:length(id)) {
       for (j in 1:length(id[[i]]))
         lines(x = c(data[i, 1], data[id[[i]][j], 1]),
-          y = c(data[i, 2], data[id[[i]][j], 2]),
+          y = c(data[i, 2], data[id[[i]][j], 2]), col = linecol,
           ...)
     }
+
+    ## add vertices
+    points(data[, 1:2], main = main, pch = pch, col = col, ...)
+
   } else {
+    ## add vertices
+    points(data[, 1:2], main = main, pch = pch, ...)
     ## use colors if it was from a query
     for (i in 1:length(id)) {
-      points(data[id[[i]], ], col = i + 1L)
+      points(data[id[[i]], ], pch = pch, col = i + 1L)
     }
   }
 }
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -13,6 +13,14 @@ ANN_cleanup <- function() {
     invisible(.Call(`_dbscan_ANN_cleanup`))
 }
 
+# Thin wrapper: passes `nn` and `mutual` unchanged to the registered native
+# routine `_dbscan_comps_kNN` via .Call() and returns its result.
+comps_kNN <- function(nn, mutual) {
+    .Call(`_dbscan_comps_kNN`, nn, mutual)
+}
+
+# Thin wrapper: passes `nn` and `mutual` unchanged to the registered native
+# routine `_dbscan_comps_frNN` via .Call() and returns its result.
+comps_frNN <- function(nn, mutual) {
+    .Call(`_dbscan_comps_frNN`, nn, mutual)
+}
+
 dbscan_int <- function(data, eps, minPts, weights, borderPoints, type, bucketSize, splitRule, approx, frNN) {
     .Call(`_dbscan_dbscan_int`, data, eps, minPts, weights, borderPoints, type, bucketSize, splitRule, approx, frNN)
 }
@@ -41,6 +49,10 @@ lof_kNN <- function(data, minPts, type, bucketSize, splitRule, approx) {
     .Call(`_dbscan_lof_kNN`, data, minPts, type, bucketSize, splitRule, approx)
 }
 
+# Thin wrapper: passes `dm` and `cd` unchanged to the registered native
+# routine `_dbscan_mrd` via .Call() and returns its result.
+# NOTE(review): presumably dm = distances and cd = core distances — confirm.
+mrd <- function(dm, cd) {
+    .Call(`_dbscan_mrd`, dm, cd)
+}
+
 optics_int <- function(data, eps, minPts, type, bucketSize, splitRule, approx, frNN) {
     .Call(`_dbscan_optics_int`, data, eps, minPts, type, bucketSize, splitRule, approx, frNN)
 }
@@ -101,14 +113,6 @@ mst_to_dendrogram <- function(mst) {
     .Call(`_dbscan_mst_to_dendrogram`, mst)
 }
 
-mrd <- function(dm, cd) {
-    .Call(`_dbscan_mrd`, dm, cd)
-}
-
-mrd_m <- function(dm, cd) {
-    .Call(`_dbscan_mrd_m`, dm, cd)
-}
-
 coreFromDist <- function(dist, n, minPts) {
     .Call(`_dbscan_coreFromDist`, dist, n, minPts)
 }

diff --git a/R/comps.R b/R/comps.R
@@ -0,0 +1,88 @@
+#######################################################################
+# dbscan - Density Based Clustering of Applications with Noise
+#          and Related Algorithms
+# Copyright (C) 2017 Michael Hahsler
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+#' Find Connected Components in a NN Graph
+#'
+#' Generic function and methods to find connected components in nearest neighbor graphs.
+#'
+#' Note that for kNN graphs, one point may be in the kNN of the other but not vice versa.
+#' `mutual = TRUE` requires that both points are in each other's kNN.
+#'
+#' @family NN functions
+#' @aliases components
+#'
+#' @param x the [NN] object representing the graph or a [dist] object
+#' @param eps threshold on the distance
+#' @param mutual for a pair of points, do both have to be in each other's neighborhood?
+#' @param ... further arguments are currently unused.
+#'
+#' @return an integer vector with component assignments.
+#'
+#' @author Michael Hahsler
+#' @keywords model
+#' @examples
+#' set.seed(665544)
+#' n <- 100
+#' x <- cbind(
+#'   x=runif(10, 0, 5) + rnorm(n, sd = 0.4),
+#'   y=runif(10, 0, 5) + rnorm(n, sd = 0.4)
+#'   )
+#' plot(x, pch = 16)
+#'
+#' # Connected components on a graph where each pair of points
+#' # with a distance less or equal to eps are connected
+#' d <- dist(x)
+#' components <- comps(d, eps = .8)
+#' plot(x, col = components, pch = 16)
+#'
+#' # Connected components in a fixed radius nearest neighbor graph
+#' # Gives the same result as the threshold on the distances above
+#' frnn <- frNN(x, eps = .8)
+#' components <- comps(frnn)
+#' plot(frnn, data = x, col = components)
+#'
+#' # Connected components on a k nearest neighbors graph
+#' knn <- kNN(x, 3)
+#' components <- comps(knn, mutual = FALSE)
+#' plot(knn, data = x, col = components)
+#'
+#' components <- comps(knn, mutual = TRUE)
+#' plot(knn, data = x, col = components)
+#'
+#' # Connected components in a shared nearest neighbor graph
+#' snn <- sNN(x, k = 10, kt = 5)
+#' components <- comps(snn)
+#' plot(snn, data = x, col = components)
+#' @export comps
+comps <- function(x, ...) UseMethod("comps", x)  # S3 generic: dispatches on the class of x
+
+# Builds a single-linkage hierarchical clustering of the distances in `x` and
+# cuts the tree at height `eps`. `eps` has no default and must be supplied;
+# arguments in `...` are ignored.
+#' @rdname comps
+comps.dist <- function(x, eps, ...)
+  stats::cutree(stats::hclust(x, method = "single"), h = eps)
+
+# Runs the native comps_kNN() on `x$id` (with `mutual` coerced to logical) and
+# recodes the returned labels through factor(), so the result is a vector of
+# integer codes 1..m for m distinct labels. Arguments in `...` are ignored.
+#' @rdname comps
+comps.kNN <- function(x, mutual = FALSE, ...)
+  as.integer(factor(comps_kNN(x$id, as.logical(mutual))))
+
+# sNN and frNN are symmetric so no need for mutual
+# This method delegates directly to comps.kNN() with `mutual = FALSE`;
+# arguments in `...` are not forwarded.
+#' @rdname comps
+comps.sNN <- function(x, ...) comps.kNN(x, mutual = FALSE)
+
+# Returns the result of the native comps_frNN() on `x$id` as is; unlike
+# comps.kNN(), no factor() recoding is applied. Arguments in `...` are ignored.
+# NOTE(review): presumably comps_frNN() already yields consecutive component
+# labels — confirm against the C++ implementation.
+#' @rdname comps
+comps.frNN <- function(x, ...) comps_frNN(x$id, mutual = FALSE)
diff --git a/R/dbscan.R b/R/dbscan.R
@@ -30,16 +30,19 @@
 #'
 #' **The algorithm**
 #'
-#' This implementation of DBSCAN (Hahsler et al, 2019) follows the original
-#' algorithm as described by Ester et al (1996). DBSCAN estimates the density
-#' around each data point by counting the number of points in a user-specified
-#' eps-neighborhood and applies a used-specified minPts thresholds to identify
-#' core, border and noise points. In a second step, core points are joined into
-#' a cluster if they are density-reachable (i.e., there is a chain of core
-#' points where one falls inside the eps-neighborhood of the next). Finally,
-#' border points are assigned to clusters. The algorithm needs parameters
-#' `eps` (the radius of the epsilon neighborhood) and `minPts` (the
-#' density threshold).
+#' This implementation of DBSCAN follows the original
+#' algorithm as described by Ester et al (1996). DBSCAN performs the following steps:
+#'
+#' 1. Estimate the density
+#'   around each data point by counting the number of points in a user-specified
+#'   eps-neighborhood and apply a user-specified minPts threshold to identify
+#'   core, border and noise points.
+#' 2. Core points are joined into
+#'   a cluster if they are density-reachable (i.e., there is a chain of core
+#'   points where one falls inside the eps-neighborhood of the next).
+#' 3. Border points are assigned to clusters. The algorithm needs parameters
+#'   `eps` (the radius of the epsilon neighborhood) and `minPts` (the
+#'   density threshold).
 #'
 #' Border points are arbitrarily assigned to clusters in the original
 #' algorithm. DBSCAN* (see Campello et al 2013) treats all border points as
@@ -82,8 +85,8 @@
 #' [predict()] can be used to predict cluster memberships for new data
 #' points. A point is considered a member of a cluster if it is within the eps
 #' neighborhood of a member of the cluster (Euclidean distance is used). Points
-#' which cannot be assigned to a cluster will be reported as members of the
-#' noise cluster 0.
+#' which cannot be assigned to a cluster will be reported as
+#' noise points (i.e., cluster ID 0).
 #'
 #' @aliases dbscan DBSCAN print.dbscan_fast
 #' @family clustering functions
@@ -103,12 +106,15 @@
 #' neighbor search algorithm. See [frNN()] for details on how to
 #' control the search strategy.
 #'
-#' @return An object of class `dbscan_fast` with the following components:
+#' @return `dbscan()` returns an object of class `dbscan_fast` with the following components:
 #'
 #' \item{eps }{ value of the `eps` parameter.}
 #' \item{minPts }{ value of the `minPts` parameter.}
 #' \item{cluster }{A integer vector with cluster assignments. Zero indicates noise points.}
 #'
+#' `is.corepoint()` returns a logical vector indicating for each data point if it is a
+#'   core point.
+#'
 #' @author Michael Hahsler
 #' @references Hahsler M, Piekenbrock M, Doran D (2019). dbscan: Fast
 #' Density-Based Clustering with R.  _Journal of Statistical Software,_
@@ -365,3 +371,7 @@ print.dbscan_fast <- function(x, ...) {
     paste(names(x), collapse = ", ")
   ), exdent = 18))
 }
+
+# Flags core points: a point is reported as a core point when the number of
+# neighbors frNN() finds within `eps` is at least `minPts - 1`.
+# NOTE(review): the `- 1` presumably accounts for frNN() not listing the point
+# itself among its own neighbors — confirm.
+# The caller's `eps` is forwarded to frNN() (it was previously hard-coded to
+# 0.5, which silently ignored the argument). vapply() keeps the neighbor
+# counts type-stable (always an integer vector).
+#' @rdname dbscan
+is.corepoint <- function(x, eps, minPts = 5, ...)
+  vapply(frNN(x, eps = eps, ...)$id, length, integer(1L)) >= (minPts - 1)