Skip to content

Commit

Permalink
#21: add HnswEuclidean class
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmelville committed Mar 11, 2024
1 parent 2b25140 commit 2c412ae
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 22 deletions.
12 changes: 12 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# RcppHNSW (development version)

## New features

* New class: `HnswEuclidean`. This uses Euclidean distances internally and will
be returned from `hnsw_build` when `distance = "euclidean"` is specified. This
fixes an issue where if you created an index with `hnsw_build` and
`distance = "euclidean"` (the default), then after saving, you would be unable
to reload the index and have it find Euclidean distances. You would have to
create it as an `HsnwL2` object and take the square root of the distances
yourself (<https://github.com/jlmelville/rcpphnsw/issues/21>).

# RcppHNSW 0.6.0

## New features
Expand Down
18 changes: 5 additions & 13 deletions R/hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ hnsw_knn <- function(X,
#' to be indexed are stored in each row. Otherwise, the items are stored in
#' the columns of `X`. Storing items in each column reduces the overhead of
#' copying data to a form that can be indexed by the `hnsw` library.
#' @return an instance of a `HnswL2`, `HnswCosine` or `HnswIp` class.
#' @return an instance of an `HnswEuclidean`, `HnswL2`, `HnswCosine` or
#' `HnswIp` class.
#' @examples
#' irism <- as.matrix(iris[, -5])
#' ann <- hnsw_build(irism)
Expand Down Expand Up @@ -211,18 +212,14 @@ hnsw_build <- function(X,
}
clazz <- switch(distance,
"l2" = RcppHNSW::HnswL2,
"euclidean" = RcppHNSW::HnswL2,
"euclidean" = RcppHNSW::HnswEuclidean,
"cosine" = RcppHNSW::HnswCosine,
"ip" = RcppHNSW::HnswIp
)
# Create the indexing object. You must say up front the number of items that
# will be stored (nitems).
ann <- methods::new(clazz, ndim, nitems, M, ef)

if (distance == "euclidean") {
attr(ann, "distance") <- "euclidean"
}

tsmessage(
"Building HNSW index with metric '",
distance,
Expand Down Expand Up @@ -253,8 +250,8 @@ hnsw_build <- function(X,
#' @param X A numeric matrix of data to search for neighbors. If `byrow = TRUE`
#' (the default) then each row of `X` is an item to be searched. Otherwise,
#' each item should be stored in the columns of `X`.
#' @param ann an instance of a `HnswL2`, `HnswCosine` or `HnswIp`
#' class.
#' @param ann an instance of an `HnswEuclidean`, `HnswL2`, `HnswCosine` or
#' `HnswIp` class.
#' @param k Number of neighbors to return. This can't be larger than the number
#' of items that were added to the index `ann`. To check the size of the
#' index, call `ann$size()`.
Expand Down Expand Up @@ -336,11 +333,6 @@ hnsw_search <-
}

dist <- res$distance
if (!is.null(attr(ann, "distance")) &&
attr(ann, "distance") == "euclidean") {
dist <- sqrt(dist)
}

tsmessage("Finished searching")
list(idx = res$item, dist = dist)
}
4 changes: 3 additions & 1 deletion R/rcpphnsw-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
#' @docType package
#' @name RcppHnsw-package
#' @aliases HnswL2 Rcpp_HnswL2-class HnswCosine Rcpp_HnswCosine-class HnswIp
#' @aliases Rcpp_HnswIp-class RcppHNSW-package
#' @aliases Rcpp_HnswIp-class HnswEuclidean Rcpp_HnswEuclidean-class
#' @aliases RcppHNSW-package
#' @references
#' <https://github.com/nmslib/hnswlib>
#' @author James Melville for the R interface; Yury Malkov for hnswlib itself.
Expand All @@ -25,6 +26,7 @@
Rcpp::loadModule("HnswL2", TRUE)
Rcpp::loadModule("HnswCosine", TRUE)
Rcpp::loadModule("HnswIp", TRUE)
Rcpp::loadModule("HnswEuclidean", TRUE)

.onUnload <- function(libpath) {
library.dynam.unload("RcppHNSW", libpath)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ all(ann$getNNs(data[1, ], 4) == ann4$getNNs(data[1, ], 4))
# other distance classes:
# Cosine: HnswCosine
# Inner Product: HnswIP
# Euclidean: HnswEuclidean
```

Here's a rough equivalent of the serialization/deserialization example from
Expand Down
2 changes: 2 additions & 0 deletions man/RcppHnsw-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/hnsw_build.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/hnsw_search.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
RcppExport SEXP _rcpp_module_boot_HnswL2();
RcppExport SEXP _rcpp_module_boot_HnswCosine();
RcppExport SEXP _rcpp_module_boot_HnswIp();
RcppExport SEXP _rcpp_module_boot_HnswEuclidean();

static const R_CallMethodDef CallEntries[] = {
{"_rcpp_module_boot_HnswL2", (DL_FUNC) &_rcpp_module_boot_HnswL2, 0},
{"_rcpp_module_boot_HnswCosine", (DL_FUNC) &_rcpp_module_boot_HnswCosine, 0},
{"_rcpp_module_boot_HnswIp", (DL_FUNC) &_rcpp_module_boot_HnswIp, 0},
{"_rcpp_module_boot_HnswEuclidean", (DL_FUNC) &_rcpp_module_boot_HnswEuclidean, 0},
{NULL, NULL, 0}
};

Expand Down
79 changes: 75 additions & 4 deletions src/hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,24 @@ template <typename dist_t> struct Normalizer<dist_t, true> {
}
};

template <typename dist_t, typename Distance, bool DoNormalize = false>

struct NoDistanceProcess {
template <typename dist_t>
static void process_distances(std::vector<dist_t> &vec) {
}
};

struct SquareRootDistanceProcess {
template <typename dist_t>
static void process_distances(std::vector<dist_t> &vec) {
for (std::size_t i = 0; i < vec.size(); i++) {
vec[i] = std::sqrt(vec[i]);
}
}
};


template <typename dist_t, typename Distance, bool DoNormalize, typename DistanceProcess>
class Hnsw {
static const constexpr std::size_t M_DEFAULT = 16;
static const constexpr std::size_t EF_CONSTRUCTION_DEFAULT = 200;
Expand Down Expand Up @@ -190,6 +207,7 @@ class Hnsw {

auto nbr_list = Rcpp::List::create(Rcpp::Named("item") = nbr_labels);
if (include_distances) {
DistanceProcess::process_distances(distances);
nbr_list["distance"] = distances;
}
return nbr_list;
Expand Down Expand Up @@ -322,6 +340,7 @@ class Hnsw {
Rcpp::Named("item") = Rcpp::IntegerMatrix(
nitems, static_cast<int>(nnbrs), idx_vec.begin()));
if (include_distances) {
DistanceProcess::process_distances(dist_vec);
result["distance"] = Rcpp::NumericMatrix(nitems, static_cast<int>(nnbrs),
dist_vec.begin());
}
Expand Down Expand Up @@ -367,6 +386,7 @@ class Hnsw {
Rcpp::Named("item") = Rcpp::IntegerMatrix(static_cast<int>(nnbrs),
nitems, idx_vec.begin()));
if (include_distances) {
DistanceProcess::process_distances(dist_vec);
result["distance"] = Rcpp::NumericMatrix(static_cast<int>(nnbrs), nitems,
dist_vec.begin());
}
Expand Down Expand Up @@ -500,9 +520,10 @@ class Hnsw {
std::unique_ptr<hnswlib::HierarchicalNSW<dist_t>> appr_alg;
};

using HnswL2 = Hnsw<float, hnswlib::L2Space, false>;
using HnswCosine = Hnsw<float, hnswlib::InnerProductSpace, true>;
using HnswIp = Hnsw<float, hnswlib::InnerProductSpace, false>;
using HnswL2 = Hnsw<float, hnswlib::L2Space, false, NoDistanceProcess>;
using HnswCosine = Hnsw<float, hnswlib::InnerProductSpace, true, NoDistanceProcess>;
using HnswIp = Hnsw<float, hnswlib::InnerProductSpace, false, NoDistanceProcess>;
using HnswEuclidean = Hnsw<float, hnswlib::L2Space, false, SquareRootDistanceProcess>;

RCPP_EXPOSED_CLASS_NODECL(HnswL2)
RCPP_MODULE(HnswL2) {
Expand Down Expand Up @@ -653,3 +674,53 @@ RCPP_MODULE(HnswIp) {
.method("resizeIndex", &HnswIp::resizeIndex,
"resize the index to use this number of items");
}

RCPP_EXPOSED_CLASS_NODECL(HnswEuclidean)
RCPP_MODULE(HnswEuclidean) {
Rcpp::class_<HnswEuclidean>("HnswEuclidean")
.constructor<int32_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef")
.constructor<int32_t, std::string>(
"constructor with dimension, loading from filename")
.constructor<int32_t, std::string, std::size_t>(
"constructor with dimension, loading from filename, number of items")
.method("setEf", &HnswEuclidean::setEf, "set ef value")
.method("addItem", &HnswEuclidean::addItem, "add item")
.method("addItems", &HnswEuclidean::addItems,
"add items where each item is stored row-wise")
.method("addItemsCol", &HnswEuclidean::addItemsCol,
"add items where each item is stored column-wise")
.method("getItems", &HnswEuclidean::getItems,
"returns a matrix of vectors with the integer identifiers "
"specified in ids vector. "
"Note that for cosine similarity, "
"normalized vectors are returned")
.method("save", &HnswEuclidean::callSave, "save index to file")
.method("getNNs", &HnswEuclidean::getNNs,
"retrieve Nearest Neigbours given vector")
.method("getNNsList", &HnswEuclidean::getNNsList,
"retrieve Nearest Neigbours given vector")
.method("getAllNNs", &HnswEuclidean::getAllNNs,
"retrieve Nearest Neigbours given matrix where items are stored "
"row-wise")
.method("getAllNNsList", &HnswEuclidean::getAllNNsList,
"retrieve Nearest Neigbours given matrix where items are stored "
"row-wise")
.method("getAllNNsCol", &HnswEuclidean::getAllNNsCol,
"retrieve Nearest Neigbours given matrix where items are stored "
"column-wise. Nearest Neighbors data is also returned "
"column-wise")
.method("getAllNNsListCol", &HnswEuclidean::getAllNNsListCol,
"retrieve Nearest Neigbours given matrix where items are stored "
"column-wise. Nearest Neighbors data is also returned "
"column-wise")
.method("size", &HnswEuclidean::size, "number of items added to the index")
.method("setNumThreads", &HnswEuclidean::setNumThreads,
"set the number of threads to use")
.method("setGrainSize", &HnswEuclidean::setGrainSize,
"set minimum grain size for using multiple threads")
.method("markDeleted", &HnswEuclidean::markDeleted,
"remove the item with the specified label from the index")
.method("resizeIndex", &HnswEuclidean::resizeIndex,
"resize the index to use this number of items");
}
17 changes: 17 additions & 0 deletions tests/testthat/test_save_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,20 @@ for (i in 1:num_elements) {
}
expect_equal(nn4idx, nn4idx_afterload)
expect_equal(nn4dist, nn4dist_afterload)

# 21: no way to use hnsw_search to get Euclidean distances after save/load
test_that("euclidean search is more consistent with save/load", {
ann <- hnsw_build(ui10, distance = "euclidean")
iris_nn <- hnsw_search(ui10, ann, k = 4)
expect_equal(iris_nn$dist, self_nn_dist4, tol = 1e-6)
expect_equal(iris_nn$idx, self_nn_index4)

temp_file <- tempfile()
on.exit(unlink(temp_file), add = TRUE)
ann$save(temp_file)

ann2 <- methods::new(RcppHNSW::HnswEuclidean, 4, temp_file)
iris_nn2 <- hnsw_search(ui10, ann, k = 4)
expect_equal(iris_nn2$dist, self_nn_dist4, tol = 1e-6)
expect_equal(iris_nn2$idx, self_nn_index4)
})
2 changes: 1 addition & 1 deletion tests/testthat/test_search_one.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ expect_equal(index$getNNs(ui10[1, ], 4), self_nn_index4[1, ])
expect_equal(index$getNNsList(ui10[1, ], 4, FALSE)$item, self_nn_index4[1, ])
nbrs_with_distances <- index$getNNsList(ui10[1, ], 4, TRUE)
expect_equal(nbrs_with_distances$item, self_nn_index4[1, ])
expect_equal(nbrs_with_distances$distance, self_nn_dist4[1, ]^2, tol = 1e-6)
expect_equal(nbrs_with_distances$distance, self_nn_dist4[1, ], tol = 1e-6)
expect_error(index$getNNsList(ui10[1, ], 15, FALSE), "(?i)unable to find")

# Test deletion
Expand Down

0 comments on commit 2c412ae

Please sign in to comment.