From 6c54753a72fa4eca05d1afa2e9de2ce7aa989c49 Mon Sep 17 00:00:00 2001 From: James Melville Date: Fri, 17 May 2024 21:32:12 -0700 Subject: [PATCH] #23 expose random seed --- NEWS.md | 7 +++++++ R/hnsw.R | 3 ++- README.md | 2 ++ src/hnsw.cpp | 36 ++++++++++++++++++++++++++--------- tests/testthat/test_verbose.R | 4 ++++ 5 files changed, 42 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8bf10d6..fbf1037 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,13 @@ fixes an issue where if you created an index with `hnsw_build` and to reload the index and have it find Euclidean distances. You would have to create it as an `HsnwL2` object and take the square root of the distances yourself (). +* The `Hnsw` constructors now expose a `random_seed` parameter that you can use +to set the random seed used in constructing the HNSW index. Internally, the +`hnsw_build` and `hnsw_knn` functions will use a random seed based on R's RNG +state. This means that if you want to reproduce results, you need to set the +random seed in R via `set.seed` before calling those functions. Based on a +request by [Maciej Beręsewicz](https://github.com/BERENZ) +(). # RcppHNSW 0.6.0 diff --git a/R/hnsw.R b/R/hnsw.R index cb6a266..1a99e4d 100644 --- a/R/hnsw.R +++ b/R/hnsw.R @@ -218,7 +218,8 @@ hnsw_build <- function(X, ) # Create the indexing object. You must say up front the number of items that # will be stored (nitems). - ann <- methods::new(clazz, ndim, nitems, M, ef) + seed <- floor(stats::runif(1, min = 0, max = 2147483647L + 1)) + ann <- methods::new(clazz, ndim, nitems, M, ef, seed) tsmessage( "Building HNSW index with metric '", diff --git a/README.md b/README.md index 5404a76..c7c4f06 100644 --- a/README.md +++ b/README.md @@ -280,6 +280,8 @@ with `dim` dimensions and a maximum size of `max_elements` items. `ef` and `M` determine the speed vs accuracy trade off. 
Other classes for different distances are: `HnswCosine` for the cosine distance and `HnswIp` for the "Inner Product" distance (like the cosine distance without normalizing). +* `new(HnswL2, dim, max_elements, M, ef_construction, random_seed)` same as the +previous constructor, but with a specified random seed. * `new(HnswL2, dim, filename)` load a previously saved index (see `save` below) with `dim` dimensions from the specified `filename`. * `new(HnswL2, dim, filename, max_elements)` load a previously saved index (see diff --git a/src/hnsw.cpp b/src/hnsw.cpp index 23722c4..7ea7f9f 100644 --- a/src/hnsw.cpp +++ b/src/hnsw.cpp @@ -51,11 +51,9 @@ template struct Normalizer { } }; - struct NoDistanceProcess { template - static void process_distances(std::vector &vec) { - } + static void process_distances(std::vector &vec) {} }; struct SquareRootDistanceProcess { @@ -67,8 +65,8 @@ struct SquareRootDistanceProcess { } }; - -template +template class Hnsw { static const constexpr std::size_t M_DEFAULT = 16; static const constexpr std::size_t EF_CONSTRUCTION_DEFAULT = 200; @@ -91,6 +89,14 @@ class Hnsw { new hnswlib::HierarchicalNSW(space.get(), max_elements, M, ef_construction))) {} + Hnsw(int dim, std::size_t max_elements, std::size_t M, + std::size_t ef_construction, std::size_t random_seed) + : dim(dim), normalize(false), cur_l(0), numThreads(0), grainSize(1), + space(std::unique_ptr(new Distance(dim))), + appr_alg(std::unique_ptr>( + new hnswlib::HierarchicalNSW( + space.get(), max_elements, M, ef_construction, random_seed))) {} + Hnsw(int dim, const std::string &path_to_index) : dim(dim), normalize(false), cur_l(0), numThreads(0), grainSize(1), space(std::unique_ptr(new Distance(dim))), @@ -521,15 +527,20 @@ class Hnsw { }; using HnswL2 = Hnsw; -using HnswCosine = Hnsw; -using HnswIp = Hnsw; -using HnswEuclidean = Hnsw; +using HnswCosine = + Hnsw; +using HnswIp = + Hnsw; +using HnswEuclidean = + Hnsw; RCPP_EXPOSED_CLASS_NODECL(HnswL2) RCPP_MODULE(HnswL2) {
Rcpp::class_("HnswL2") .constructor( "constructor with dimension, number of items, M, ef") + .constructor( + "constructor with dimension, number of items, M, ef, random seed") .constructor( "constructor with dimension, loading from filename") .constructor( @@ -580,6 +591,8 @@ RCPP_MODULE(HnswCosine) { Rcpp::class_("HnswCosine") .constructor( "constructor with dimension, number of items, M, ef") + .constructor( + "constructor with dimension, number of items, M, ef, random seed") .constructor( "constructor with dimension, loading from filename") .constructor( @@ -630,6 +643,8 @@ RCPP_MODULE(HnswIp) { Rcpp::class_("HnswIp") .constructor( "constructor with dimension, number of items, M, ef") + .constructor( + "constructor with dimension, number of items, M, ef, random seed") .constructor( "constructor with dimension, loading from filename") .constructor( @@ -680,6 +695,8 @@ RCPP_MODULE(HnswEuclidean) { Rcpp::class_("HnswEuclidean") .constructor( "constructor with dimension, number of items, M, ef") + .constructor( + "constructor with dimension, number of items, M, ef, random seed") .constructor( "constructor with dimension, loading from filename") .constructor( @@ -714,7 +731,8 @@ RCPP_MODULE(HnswEuclidean) { "retrieve Nearest Neigbours given matrix where items are stored " "column-wise. 
Nearest Neighbors data is also returned " "column-wise") - .method("size", &HnswEuclidean::size, "number of items added to the index") + .method("size", &HnswEuclidean::size, + "number of items added to the index") .method("setNumThreads", &HnswEuclidean::setNumThreads, "set the number of threads to use") .method("setGrainSize", &HnswEuclidean::setGrainSize, diff --git a/tests/testthat/test_verbose.R b/tests/testthat/test_verbose.R index 8d854b9..cb9f51a 100644 --- a/tests/testthat/test_verbose.R +++ b/tests/testthat/test_verbose.R @@ -5,8 +5,12 @@ set.seed(1337) data <- matrix(rnorm(100 * 10), nrow = 100) # results should be the same +# 23: also counts as a test of setting the seed to get repeatable results +set.seed(1337) res_p <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = TRUE, progress = "bar") +set.seed(1337) res_v <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = TRUE, progress = NULL) +set.seed(1337) res_q <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = FALSE) expect_equal(sum(res_p$idx - res_v$idx), 0) expect_equal(sum(res_p$idx - res_q$idx), 0)