Skip to content

Commit

Permalink
Added helper functions and fixed mrd memory leak.
Browse files Browse the repository at this point in the history
  • Loading branch information
mhahsler committed Jan 14, 2022
1 parent 16cda5b commit d347e13
Show file tree
Hide file tree
Showing 34 changed files with 907 additions and 434 deletions.
23 changes: 12 additions & 11 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
Package: dbscan
Version: 1.1-9
Date: 2022-01-10
Title: Density Based Clustering of Applications with Noise (DBSCAN) and Related
Algorithms
Version: 1.1-10
Date: 2022-01-14
Title: Density-Based Spatial Clustering of Applications with Noise (DBSCAN)
and Related Algorithms
Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"),
email = "[email protected]"),
person("Matthew", "Piekenbrock", role = c("aut", "cph")),
person("Sunil", "Arya", role = c("ctb", "cph")),
person("David", "Mount", role = c("ctb", "cph")))
Description: A fast reimplementation of several density-based algorithms of
the DBSCAN family for spatial data. Includes the clustering algorithms
DBSCAN (density-based spatial clustering of applications with noise)
and HDBSCAN (hierarchical DBSCAN), the ordering algorithm
OPTICS (ordering points to identify the clustering structure),
and the outlier detection algorithm LOF (local outlier factor).
The implementations use the kd-tree data structure (from library ANN) for faster k-nearest neighbor search.
An R interface to fast kNN and fixed-radius NN search is also provided.
the DBSCAN family. Includes the clustering algorithms DBSCAN (density-based
spatial clustering of applications with noise) and HDBSCAN (hierarchical
DBSCAN), the ordering algorithm OPTICS (ordering points to identify the
clustering structure), shared nearest neighbor clustering, and the outlier
detection algorithms LOF (local outlier factor) and GLOSH (global-local
outlier score from hierarchies). The implementations use the kd-tree data
structure (from library ANN) for faster k-nearest neighbor search. An R
interface to fast kNN and fixed-radius NN search is also provided.
Hahsler, Piekenbrock and Doran (2019) <doi:10.18637/jss.v091.i01>.
SystemRequirements: C++11
Imports:
Expand Down
25 changes: 22 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
useDynLib("dbscan", .registration = TRUE)
import("Rcpp")
importFrom("graphics", "plot", "points", "par", "segments",
"lines", "polygon", "text")
"lines", "polygon", "text", "abline")
importFrom("grDevices", "palette", "chull", "adjustcolor")
importFrom("stats", "dist", "hclust", "dendrapply", "as.dendrogram",
"is.leaf", "prcomp")
importFrom("utils", "tail")


export(
adjacencylist,
kNN,
kNNdistplot,
kNNdist,
frNN,
sNN,
adjacencylist,

dbscan,
is.corepoint,

optics,
extractDBSCAN,
extractXi,

hdbscan,
extractFOSC,
as.reachability,
coredist,
mrdist,

sNNclust,

jpclust,

lof,
glosh,
pointdensity,

hullplot,
as.reachability

comps
)

S3method(print, dbscan_fast)
Expand Down Expand Up @@ -59,3 +73,8 @@ S3method(sort, sNN)
S3method(print, sNN)

S3method(plot, NN)

S3method(comps, dist)
S3method(comps, kNN)
S3method(comps, sNN)
S3method(comps, frNN)
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# dbscan 1.1-10 (2022-01-14)

## New Features
* is.corepoint() for DBSCAN.
* coredist() and mrdist() for HDBSCAN.
* find connected components with comps().

## Changes
* reachability plot now shows all undefined distances as a dashed line.

## Bugfix
* memory leak in mrd calculation fixed.

# dbscan 1.1-9 (2022-01-10)

## Changes
Expand Down
21 changes: 16 additions & 5 deletions R/NN.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
#' @family NN functions
#'
#' @param x a `NN` object
#' @param ... further parameters
#' @param pch plotting character.
#' @param col color used for the data points (nodes).
#' @param linecol color used for edges.
#' @param ... further parameters passed on to [plot()].
#' @param decreasing sort in decreasing order?
#' @param data that was used to create `x`
#' @param main title
Expand Down Expand Up @@ -81,7 +84,7 @@ sort.NN <- function(x, decreasing = FALSE, ...) {


#' @rdname NN
plot.NN <- function(x, data, main = NULL, ...) {
plot.NN <- function(x, data, main = NULL, pch = 16, col = NULL, linecol = "gray", ...) {
if (is.null(main)) {
if (inherits(x, "frNN"))
main <- paste0("frNN graph (eps = ", x$eps, ")")
Expand All @@ -91,7 +94,9 @@ plot.NN <- function(x, data, main = NULL, ...) {
main <- paste0("Shared NN graph (k=", x$k,
ifelse(is.null(x$kt), "", paste0(", kt=", x$kt)), ")")
}
plot(data[, 1:2], main = main, ...)

## create an empty plot
plot(data[, 1:2], main = main, type = "n", pch = pch, col = col, ...)

id <- adjacencylist(x)

Expand All @@ -101,13 +106,19 @@ plot.NN <- function(x, data, main = NULL, ...) {
for (i in 1:length(id)) {
for (j in 1:length(id[[i]]))
lines(x = c(data[i, 1], data[id[[i]][j], 1]),
y = c(data[i, 2], data[id[[i]][j], 2]),
y = c(data[i, 2], data[id[[i]][j], 2]), col = linecol,
...)
}

## add vertices
points(data[, 1:2], main = main, pch = pch, col = col, ...)

} else {
## add vertices
points(data[, 1:2], main = main, pch = pch, ...)
## use colors if it was from a query
for (i in 1:length(id)) {
points(data[id[[i]], ], col = i + 1L)
points(data[id[[i]], ], pch = pch, col = i + 1L)
}
}
}
20 changes: 12 additions & 8 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ ANN_cleanup <- function() {
invisible(.Call(`_dbscan_ANN_cleanup`))
}

# Thin R binding: forwards `nn` and `mutual` unchanged to the registered
# native routine `_dbscan_comps_kNN` and returns whatever that routine returns.
# Within this package it is called by comps.kNN() with the `id` element of a
# kNN object and a logical flag.
comps_kNN <- function(nn, mutual) {
.Call(`_dbscan_comps_kNN`, nn, mutual)
}

# Thin R binding: forwards `nn` and `mutual` unchanged to the registered
# native routine `_dbscan_comps_frNN` and returns its result.
# Within this package it is called by comps.frNN() with the `id` element of a
# frNN object and `mutual = FALSE`.
comps_frNN <- function(nn, mutual) {
.Call(`_dbscan_comps_frNN`, nn, mutual)
}

# Thin R binding: passes all ten arguments, in the order given, to the
# registered native routine `_dbscan_dbscan_int` and returns its result.
# No validation or conversion happens at this level.
dbscan_int <- function(data, eps, minPts, weights, borderPoints, type, bucketSize, splitRule, approx, frNN) {
.Call(`_dbscan_dbscan_int`, data, eps, minPts, weights, borderPoints, type, bucketSize, splitRule, approx, frNN)
}
Expand Down Expand Up @@ -41,6 +49,10 @@ lof_kNN <- function(data, minPts, type, bucketSize, splitRule, approx) {
.Call(`_dbscan_lof_kNN`, data, minPts, type, bucketSize, splitRule, approx)
}

# Thin R binding: forwards `dm` and `cd` unchanged to the registered native
# routine `_dbscan_mrd` and returns its result.
# NOTE(review): `dm` / `cd` presumably are the distances and the core
# distances used for the mutual reachability distance -- confirm in the C++ source.
mrd <- function(dm, cd) {
.Call(`_dbscan_mrd`, dm, cd)
}

# Thin R binding: passes all eight arguments, in the order given, to the
# registered native routine `_dbscan_optics_int` and returns its result.
# No validation or conversion happens at this level.
optics_int <- function(data, eps, minPts, type, bucketSize, splitRule, approx, frNN) {
.Call(`_dbscan_optics_int`, data, eps, minPts, type, bucketSize, splitRule, approx, frNN)
}
Expand Down Expand Up @@ -101,14 +113,6 @@ mst_to_dendrogram <- function(mst) {
.Call(`_dbscan_mst_to_dendrogram`, mst)
}

# Thin R binding: forwards `dm` and `cd` unchanged to the registered native
# routine `_dbscan_mrd` and returns its result.
mrd <- function(dm, cd) {
.Call(`_dbscan_mrd`, dm, cd)
}

# Thin R binding: forwards `dm` and `cd` unchanged to the registered native
# routine `_dbscan_mrd_m` and returns its result.
mrd_m <- function(dm, cd) {
.Call(`_dbscan_mrd_m`, dm, cd)
}

# Thin R binding: forwards `dist`, `n` and `minPts` unchanged to the
# registered native routine `_dbscan_coreFromDist` and returns its result.
# NOTE(review): `n` presumably is the number of observations behind `dist` --
# confirm against the callers.
coreFromDist <- function(dist, n, minPts) {
.Call(`_dbscan_coreFromDist`, dist, n, minPts)
}
Expand Down
88 changes: 88 additions & 0 deletions R/comps.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#######################################################################
# dbscan - Density Based Clustering of Applications with Noise
# and Related Algorithms
# Copyright (C) 2017 Michael Hahsler

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

#' Find Connected Components in a NN Graph
#'
#' Generic function and methods to find connected components in nearest neighbor graphs.
#'
#' Note that for kNN graphs, one point may be in the kNN of the other but not vice versa.
#' `mutual = TRUE` requires that both points are in each other's kNN.
#'
#' @family NN functions
#' @aliases components
#'
#' @param x the [NN] object representing the graph or a [dist] object
#' @param eps threshold on the distance
#' @param mutual for a pair of points, do both have to be in each other's neighborhood?
#' @param ... further arguments are currently unused.
#'
#' @return an integer vector with component assignments.
#'
#' @author Michael Hahsler
#' @keywords model
#' @examples
#' set.seed(665544)
#' n <- 100
#' x <- cbind(
#' x=runif(10, 0, 5) + rnorm(n, sd = 0.4),
#' y=runif(10, 0, 5) + rnorm(n, sd = 0.4)
#' )
#' plot(x, pch = 16)
#'
#' # Connected components on a graph where each pair of points
#' # with a distance less than or equal to eps are connected
#' d <- dist(x)
#' components <- comps(d, eps = .8)
#' plot(x, col = components, pch = 16)
#'
#' # Connected components in a fixed radius nearest neighbor graph
#' # Gives the same result as the threshold on the distances above
#' frnn <- frNN(x, eps = .8)
#' components <- comps(frnn)
#' plot(frnn, data = x, col = components)
#'
#' # Connected components on a k nearest neighbors graph
#' knn <- kNN(x, 3)
#' components <- comps(knn, mutual = FALSE)
#' plot(knn, data = x, col = components)
#'
#' components <- comps(knn, mutual = TRUE)
#' plot(knn, data = x, col = components)
#'
#' # Connected components in a shared nearest neighbor graph
#' snn <- sNN(x, k = 10, kt = 5)
#' components <- comps(snn)
#' plot(snn, data = x, col = components)
#' @export comps
comps <- function(x, ...) {
  # S3 generic: the method is selected by the class of the first argument `x`.
  UseMethod("comps")
}

#' @rdname comps
comps.dist <- function(x, eps, ...) {
  # Build the single-linkage hierarchy on the distances, then cut it at
  # height `eps`; the resulting group ids are the component labels.
  single_link <- stats::hclust(x, method = "single")
  stats::cutree(single_link, h = eps)
}

#' @rdname comps
comps.kNN <- function(x, mutual = FALSE, ...) {
  # Component labels as delivered by the native routine for the neighbor ids.
  raw_labels <- comps_kNN(x$id, as.logical(mutual))

  # factor() followed by as.integer() renumbers the labels to 1, 2, ...
  # following the sort order of the distinct raw labels.
  as.integer(factor(raw_labels))
}

# Shared-NN graphs (like frNN graphs) are symmetric, so a `mutual` option is
# not needed; the kNN method is reused with mutual = FALSE.
#' @rdname comps
comps.sNN <- function(x, ...) {
  comps.kNN(x, mutual = FALSE)
}

#' @rdname comps
comps.frNN <- function(x, ...) {
  # Hand the neighbor id lists to the native routine; `mutual` is always FALSE.
  neighbor_ids <- x$id
  comps_frNN(neighbor_ids, mutual = FALSE)
}
36 changes: 23 additions & 13 deletions R/dbscan.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,19 @@
#'
#' **The algorithm**
#'
#' This implementation of DBSCAN (Hahsler et al, 2019) follows the original
#' algorithm as described by Ester et al (1996). DBSCAN estimates the density
#' around each data point by counting the number of points in a user-specified
#' eps-neighborhood and applies a used-specified minPts thresholds to identify
#' core, border and noise points. In a second step, core points are joined into
#' a cluster if they are density-reachable (i.e., there is a chain of core
#' points where one falls inside the eps-neighborhood of the next). Finally,
#' border points are assigned to clusters. The algorithm needs parameters
#' `eps` (the radius of the epsilon neighborhood) and `minPts` (the
#' density threshold).
#' This implementation of DBSCAN follows the original
#' algorithm as described by Ester et al (1996). DBSCAN performs the following steps:
#'
#' 1. Estimate the density
#' around each data point by counting the number of points in a user-specified
#' eps-neighborhood and apply a user-specified minPts threshold to identify
#' core, border and noise points.
#' 2. Core points are joined into
#' a cluster if they are density-reachable (i.e., there is a chain of core
#' points where one falls inside the eps-neighborhood of the next).
#' 3. Border points are assigned to clusters. The algorithm needs parameters
#' `eps` (the radius of the epsilon neighborhood) and `minPts` (the
#' density threshold).
#'
#' Border points are arbitrarily assigned to clusters in the original
#' algorithm. DBSCAN* (see Campello et al 2013) treats all border points as
Expand Down Expand Up @@ -82,8 +85,8 @@
#' [predict()] can be used to predict cluster memberships for new data
#' points. A point is considered a member of a cluster if it is within the eps
#' neighborhood of a member of the cluster (Euclidean distance is used). Points
#' which cannot be assigned to a cluster will be reported as members of the
#' noise cluster 0.
#' which cannot be assigned to a cluster will be reported as
#' noise points (i.e., cluster ID 0).
#'
#' @aliases dbscan DBSCAN print.dbscan_fast
#' @family clustering functions
Expand All @@ -103,12 +106,15 @@
#' neighbor search algorithm. See [frNN()] for details on how to
#' control the search strategy.
#'
#' @return An object of class `dbscan_fast` with the following components:
#' @return `dbscan()` returns an object of class `dbscan_fast` with the following components:
#'
#' \item{eps }{ value of the `eps` parameter.}
#' \item{minPts }{ value of the `minPts` parameter.}
#' \item{cluster }{An integer vector with cluster assignments. Zero indicates noise points.}
#'
#' `is.corepoint()` returns a logical vector indicating for each data point if it is a
#' core point.
#'
#' @author Michael Hahsler
#' @references Hahsler M, Piekenbrock M, Doran D (2019). dbscan: Fast
#' Density-Based Clustering with R. _Journal of Statistical Software,_
Expand Down Expand Up @@ -365,3 +371,7 @@ print.dbscan_fast <- function(x, ...) {
paste(names(x), collapse = ", ")
), exdent = 18))
}

#' @rdname dbscan
is.corepoint <- function(x, eps, minPts = 5, ...) {
  # Fixed-radius neighbor lists for every point. `eps` is forwarded so the
  # neighborhood radius is the one requested by the caller; further arguments
  # in `...` go to frNN() as well.
  neighbors <- frNN(x, eps = eps, ...)$id

  # A point is flagged when it has at least minPts - 1 neighbors.
  # NOTE(review): the "- 1" presumably accounts for the point itself, which
  # is not expected to appear in its own frNN id list -- confirm in frNN().
  # lengths() returns an integer vector for any list, including an empty one.
  lengths(neighbors) >= (minPts - 1)
}
Loading

0 comments on commit d347e13

Please sign in to comment.