diff --git a/NEWS.md b/NEWS.md index 12ecefc..8bf10d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# RcppHNSW (development version) + +## New features + +* New class: `HnswEuclidean`. This uses Euclidean distances internally and will +be returned from `hnsw_build` when `distance = "euclidean"` is specified. This +fixes an issue where if you created an index with `hnsw_build` and +`distance = "euclidean"` (the default), then after saving, you would be unable +to reload the index and have it find Euclidean distances. You would have to +create it as an `HsnwL2` object and take the square root of the distances +yourself (). + # RcppHNSW 0.6.0 ## New features diff --git a/R/hnsw.R b/R/hnsw.R index 836c376..cb6a266 100644 --- a/R/hnsw.R +++ b/R/hnsw.R @@ -174,7 +174,8 @@ hnsw_knn <- function(X, #' to be indexed are stored in each row. Otherwise, the items are stored in #' the columns of `X`. Storing items in each column reduces the overhead of #' copying data to a form that can be indexed by the `hnsw` library. -#' @return an instance of a `HnswL2`, `HnswCosine` or `HnswIp` class. +#' @return an instance of an `HnswEuclidean`, `HnswL2`, `HnswCosine` or +#' `HnswIp` class. #' @examples #' irism <- as.matrix(iris[, -5]) #' ann <- hnsw_build(irism) @@ -211,7 +212,7 @@ hnsw_build <- function(X, } clazz <- switch(distance, "l2" = RcppHNSW::HnswL2, - "euclidean" = RcppHNSW::HnswL2, + "euclidean" = RcppHNSW::HnswEuclidean, "cosine" = RcppHNSW::HnswCosine, "ip" = RcppHNSW::HnswIp ) @@ -219,10 +220,6 @@ hnsw_build <- function(X, # will be stored (nitems). ann <- methods::new(clazz, ndim, nitems, M, ef) - if (distance == "euclidean") { - attr(ann, "distance") <- "euclidean" - } - tsmessage( "Building HNSW index with metric '", distance, @@ -253,8 +250,8 @@ hnsw_build <- function(X, #' @param X A numeric matrix of data to search for neighbors. If `byrow = TRUE` #' (the default) then each row of `X` is an item to be searched. Otherwise, #' each item should be stored in the columns of `X`. -#' @param ann an instance of a `HnswL2`, `HnswCosine` or `HnswIp` -#' class. +#' @param ann an instance of an `HnswEuclidean`, `HnswL2`, `HnswCosine` or +#' `HnswIp` class. #' @param k Number of neighbors to return. This can't be larger than the number #' of items that were added to the index `ann`. To check the size of the #' index, call `ann$size()`. @@ -336,11 +333,6 @@ hnsw_search <- } dist <- res$distance - if (!is.null(attr(ann, "distance")) && - attr(ann, "distance") == "euclidean") { - dist <- sqrt(dist) - } - tsmessage("Finished searching") list(idx = res$item, dist = dist) } diff --git a/R/rcpphnsw-package.R b/R/rcpphnsw-package.R index 4a643d0..f35f5b7 100644 --- a/R/rcpphnsw-package.R +++ b/R/rcpphnsw-package.R @@ -8,7 +8,8 @@ #' @docType package #' @name RcppHnsw-package #' @aliases HnswL2 Rcpp_HnswL2-class HnswCosine Rcpp_HnswCosine-class HnswIp -#' @aliases Rcpp_HnswIp-class RcppHNSW-package +#' @aliases Rcpp_HnswIp-class HnswEuclidean Rcpp_HnswEuclidean-class +#' @aliases RcppHNSW-package #' @references #' #' @author James Melville for the R interface; Yury Malkov for hnswlib itself. @@ -25,6 +26,7 @@ Rcpp::loadModule("HnswL2", TRUE) Rcpp::loadModule("HnswCosine", TRUE) Rcpp::loadModule("HnswIp", TRUE) +Rcpp::loadModule("HnswEuclidean", TRUE) .onUnload <- function(libpath) { library.dynam.unload("RcppHNSW", libpath) diff --git a/README.md b/README.md index 7c166b4..5404a76 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,7 @@ all(ann$getNNs(data[1, ], 4) == ann4$getNNs(data[1, ], 4)) # other distance classes: # Cosine: HnswCosine # Inner Product: HnswIP +# Euclidean: HnswEuclidean ``` Here's a rough equivalent of the serialization/deserialization example from diff --git a/man/RcppHnsw-package.Rd b/man/RcppHnsw-package.Rd index daab253..73fd972 100644 --- a/man/RcppHnsw-package.Rd +++ b/man/RcppHnsw-package.Rd @@ -10,6 +10,8 @@ \alias{Rcpp_HnswCosine-class} \alias{HnswIp} \alias{Rcpp_HnswIp-class} +\alias{HnswEuclidean} +\alias{Rcpp_HnswEuclidean-class} \alias{RcppHNSW-package} \title{Rcpp bindings for the hnswlib C++ library for approximate nearest neighbors.} \description{ diff --git a/man/hnsw_build.Rd b/man/hnsw_build.Rd index 85c8968..a9715aa 100644 --- a/man/hnsw_build.Rd +++ b/man/hnsw_build.Rd @@ -60,7 +60,8 @@ the columns of \code{X}. Storing items in each column reduces the overhead of copying data to a form that can be indexed by the \code{hnsw} library.} } \value{ -an instance of a \code{HnswL2}, \code{HnswCosine} or \code{HnswIp} class. +an instance of an \code{HnswEuclidean}, \code{HnswL2}, \code{HnswCosine} or +\code{HnswIp} class. } \description{ Build an hnswlib nearest neighbor index diff --git a/man/hnsw_search.Rd b/man/hnsw_search.Rd index ad501a7..2a1f6e3 100644 --- a/man/hnsw_search.Rd +++ b/man/hnsw_search.Rd @@ -21,8 +21,8 @@ hnsw_search( (the default) then each row of \code{X} is an item to be searched. Otherwise, each item should be stored in the columns of \code{X}.} -\item{ann}{an instance of a \code{HnswL2}, \code{HnswCosine} or \code{HnswIp} -class.} +\item{ann}{an instance of an \code{HnswEuclidean}, \code{HnswL2}, \code{HnswCosine} or +\code{HnswIp} class.} \item{k}{Number of neighbors to return. This can't be larger than the number of items that were added to the index \code{ann}. To check the size of the diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c5acae7..d00514e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -14,11 +14,13 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); RcppExport SEXP _rcpp_module_boot_HnswL2(); RcppExport SEXP _rcpp_module_boot_HnswCosine(); RcppExport SEXP _rcpp_module_boot_HnswIp(); +RcppExport SEXP _rcpp_module_boot_HnswEuclidean(); static const R_CallMethodDef CallEntries[] = { {"_rcpp_module_boot_HnswL2", (DL_FUNC) &_rcpp_module_boot_HnswL2, 0}, {"_rcpp_module_boot_HnswCosine", (DL_FUNC) &_rcpp_module_boot_HnswCosine, 0}, {"_rcpp_module_boot_HnswIp", (DL_FUNC) &_rcpp_module_boot_HnswIp, 0}, + {"_rcpp_module_boot_HnswEuclidean", (DL_FUNC) &_rcpp_module_boot_HnswEuclidean, 0}, {NULL, NULL, 0} }; diff --git a/src/hnsw.cpp b/src/hnsw.cpp index 4013020..23722c4 100644 --- a/src/hnsw.cpp +++ b/src/hnsw.cpp @@ -51,7 +51,24 @@ template struct Normalizer { } }; -template + +struct NoDistanceProcess { + template + static void process_distances(std::vector &vec) { + } +}; + +struct SquareRootDistanceProcess { + template + static void process_distances(std::vector &vec) { + for (std::size_t i = 0; i < vec.size(); i++) { + vec[i] = std::sqrt(vec[i]); + } + } +}; + + +template class Hnsw { static const constexpr std::size_t M_DEFAULT = 16; static const constexpr std::size_t EF_CONSTRUCTION_DEFAULT = 200; @@ -190,6 +207,7 @@ class Hnsw { auto nbr_list = Rcpp::List::create(Rcpp::Named("item") = nbr_labels); if (include_distances) { + DistanceProcess::process_distances(distances); nbr_list["distance"] = distances; } return nbr_list; @@ -322,6 +340,7 @@ class Hnsw { Rcpp::Named("item") = Rcpp::IntegerMatrix( nitems, static_cast(nnbrs), idx_vec.begin())); if (include_distances) { + DistanceProcess::process_distances(dist_vec); result["distance"] = Rcpp::NumericMatrix(nitems, static_cast(nnbrs), dist_vec.begin()); } @@ -367,6 +386,7 @@ class Hnsw { Rcpp::Named("item") = Rcpp::IntegerMatrix(static_cast(nnbrs), nitems, idx_vec.begin())); if (include_distances) { + DistanceProcess::process_distances(dist_vec); result["distance"] = Rcpp::NumericMatrix(static_cast(nnbrs), nitems, dist_vec.begin()); } @@ -500,9 +520,10 @@ class Hnsw { std::unique_ptr> appr_alg; }; -using HnswL2 = Hnsw; -using HnswCosine = Hnsw; -using HnswIp = Hnsw; +using HnswL2 = Hnsw; +using HnswCosine = Hnsw; +using HnswIp = Hnsw; +using HnswEuclidean = Hnsw; RCPP_EXPOSED_CLASS_NODECL(HnswL2) RCPP_MODULE(HnswL2) { @@ -653,3 +674,53 @@ RCPP_MODULE(HnswIp) { .method("resizeIndex", &HnswIp::resizeIndex, "resize the index to use this number of items"); } + +RCPP_EXPOSED_CLASS_NODECL(HnswEuclidean) +RCPP_MODULE(HnswEuclidean) { + Rcpp::class_("HnswEuclidean") + .constructor( + "constructor with dimension, number of items, M, ef") + .constructor( + "constructor with dimension, loading from filename") + .constructor( + "constructor with dimension, loading from filename, number of items") + .method("setEf", &HnswEuclidean::setEf, "set ef value") + .method("addItem", &HnswEuclidean::addItem, "add item") + .method("addItems", &HnswEuclidean::addItems, + "add items where each item is stored row-wise") + .method("addItemsCol", &HnswEuclidean::addItemsCol, + "add items where each item is stored column-wise") + .method("getItems", &HnswEuclidean::getItems, + "returns a matrix of vectors with the integer identifiers " + "specified in ids vector. " + "Note that for cosine similarity, " + "normalized vectors are returned") + .method("save", &HnswEuclidean::callSave, "save index to file") + .method("getNNs", &HnswEuclidean::getNNs, + "retrieve Nearest Neigbours given vector") + .method("getNNsList", &HnswEuclidean::getNNsList, + "retrieve Nearest Neigbours given vector") + .method("getAllNNs", &HnswEuclidean::getAllNNs, + "retrieve Nearest Neigbours given matrix where items are stored " + "row-wise") + .method("getAllNNsList", &HnswEuclidean::getAllNNsList, + "retrieve Nearest Neigbours given matrix where items are stored " + "row-wise") + .method("getAllNNsCol", &HnswEuclidean::getAllNNsCol, + "retrieve Nearest Neigbours given matrix where items are stored " + "column-wise. Nearest Neighbors data is also returned " + "column-wise") + .method("getAllNNsListCol", &HnswEuclidean::getAllNNsListCol, + "retrieve Nearest Neigbours given matrix where items are stored " + "column-wise. Nearest Neighbors data is also returned " + "column-wise") + .method("size", &HnswEuclidean::size, "number of items added to the index") + .method("setNumThreads", &HnswEuclidean::setNumThreads, + "set the number of threads to use") + .method("setGrainSize", &HnswEuclidean::setGrainSize, + "set minimum grain size for using multiple threads") + .method("markDeleted", &HnswEuclidean::markDeleted, + "remove the item with the specified label from the index") + .method("resizeIndex", &HnswEuclidean::resizeIndex, + "resize the index to use this number of items"); +} diff --git a/tests/testthat/test_save_load.R b/tests/testthat/test_save_load.R index 64c14bc..63a6fb2 100644 --- a/tests/testthat/test_save_load.R +++ b/tests/testthat/test_save_load.R @@ -45,3 +45,20 @@ for (i in 1:num_elements) { } expect_equal(nn4idx, nn4idx_afterload) expect_equal(nn4dist, nn4dist_afterload) + +# 21: no way to use hnsw_search to get Euclidean distances after save/load +test_that("euclidean search is more consistent with save/load", { + ann <- hnsw_build(ui10, distance = "euclidean") + iris_nn <- hnsw_search(ui10, ann, k = 4) + expect_equal(iris_nn$dist, self_nn_dist4, tol = 1e-6) + expect_equal(iris_nn$idx, self_nn_index4) + + temp_file <- tempfile() + on.exit(unlink(temp_file), add = TRUE) + ann$save(temp_file) + + ann2 <- methods::new(RcppHNSW::HnswEuclidean, 4, temp_file) + iris_nn2 <- hnsw_search(ui10, ann, k = 4) + expect_equal(iris_nn2$dist, self_nn_dist4, tol = 1e-6) + expect_equal(iris_nn2$idx, self_nn_index4) +}) diff --git a/tests/testthat/test_search_one.R b/tests/testthat/test_search_one.R index 9591227..31acead 100644 --- a/tests/testthat/test_search_one.R +++ b/tests/testthat/test_search_one.R @@ -22,7 +22,7 @@ expect_equal(index$getNNs(ui10[1, ], 4), self_nn_index4[1, ]) expect_equal(index$getNNsList(ui10[1, ], 4, FALSE)$item, self_nn_index4[1, ]) nbrs_with_distances <- index$getNNsList(ui10[1, ], 4, TRUE) expect_equal(nbrs_with_distances$item, self_nn_index4[1, ]) -expect_equal(nbrs_with_distances$distance, self_nn_dist4[1, ]^2, tol = 1e-6) +expect_equal(nbrs_with_distances$distance, self_nn_dist4[1, ], tol = 1e-6) expect_error(index$getNNsList(ui10[1, ], 15, FALSE), "(?i)unable to find") # Test deletion