Skip to content

Commit

Permalink
#23 expose random seed
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmelville committed May 18, 2024
1 parent 2c412ae commit 6c54753
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 10 deletions.
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ fixes an issue where if you created an index with `hnsw_build` and
to reload the index and have it find Euclidean distances. You would have to
create it as an `HnswL2` object and take the square root of the distances
yourself (<https://github.com/jlmelville/rcpphnsw/issues/21>).
* The `Hnsw` constructors now expose a `random_seed` parameter that you can use
to set the random seed used in constructing the HNSW index. Internally, the
`hnsw_build` and `hnsw_knn` functions will use a random seed based on R's RNG
state. This means that if you want to reproduce results, you need to set the
random seed in R via `set.seed` before calling those functions. Based on a
request by [Maciej Beręsewicz](https://github.com/BERENZ)
(<https://github.com/jlmelville/rcpphnsw/issues/23>).

# RcppHNSW 0.6.0

Expand Down
3 changes: 2 additions & 1 deletion R/hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ hnsw_build <- function(X,
)
# Create the indexing object. You must say up front the number of items that
# will be stored (nitems).
ann <- methods::new(clazz, ndim, nitems, M, ef)
seed <- floor(stats::runif(1, min = 0, max = 2147483647L + 1))
ann <- methods::new(clazz, ndim, nitems, M, ef, seed)

tsmessage(
"Building HNSW index with metric '",
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,8 @@ with `dim` dimensions and a maximum size of `max_elements` items. `ef` and `M`
determine the speed vs accuracy trade off. Other classes for different distances
are: `HnswCosine` for the cosine distance and `HnswIp` for the "Inner Product"
distance (like the cosine distance without normalizing).
* `new(HnswL2, dim, max_elements, M, ef_construction, random_seed)` same as the
previous constructor, but with a specified random seed.
* `new(HnswL2, dim, filename)` load a previously saved index (see `save` below)
with `dim` dimensions from the specified `filename`.
* `new(HnswL2, dim, filename, max_elements)` load a previously saved index (see
Expand Down
36 changes: 27 additions & 9 deletions src/hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,9 @@ template <typename dist_t> struct Normalizer<dist_t, true> {
}
};


struct NoDistanceProcess {
template <typename dist_t>
static void process_distances(std::vector<dist_t> &vec) {
}
static void process_distances(std::vector<dist_t> &vec) {}
};

struct SquareRootDistanceProcess {
Expand All @@ -67,8 +65,8 @@ struct SquareRootDistanceProcess {
}
};


template <typename dist_t, typename Distance, bool DoNormalize, typename DistanceProcess>
template <typename dist_t, typename Distance, bool DoNormalize,
typename DistanceProcess>
class Hnsw {
static const constexpr std::size_t M_DEFAULT = 16;
static const constexpr std::size_t EF_CONSTRUCTION_DEFAULT = 200;
Expand All @@ -91,6 +89,14 @@ class Hnsw {
new hnswlib::HierarchicalNSW<dist_t>(space.get(), max_elements, M,
ef_construction))) {}

// Creates a new index with an explicitly specified random seed.
//
// dim:             passed to the Distance (space) constructor.
// max_elements:    forwarded to hnswlib::HierarchicalNSW.
// M:               forwarded to hnswlib::HierarchicalNSW.
// ef_construction: forwarded to hnswlib::HierarchicalNSW.
// random_seed:     forwarded to hnswlib::HierarchicalNSW as its final
//                  constructor argument.
//
// normalize starts as false, cur_l and numThreads as 0, grainSize as 1.
Hnsw(int dim, std::size_t max_elements, std::size_t M,
std::size_t ef_construction, std::size_t random_seed)
: dim(dim), normalize(false), cur_l(0), numThreads(0), grainSize(1),
space(std::unique_ptr<Distance>(new Distance(dim))),
// The index is handed a raw pointer to the space object owned by `space`.
// NOTE(review): this relies on `space` being declared before `appr_alg` in
// the class so that it is initialized first -- member declarations are not
// visible here; confirm.
appr_alg(std::unique_ptr<hnswlib::HierarchicalNSW<dist_t>>(
new hnswlib::HierarchicalNSW<dist_t>(
space.get(), max_elements, M, ef_construction, random_seed))) {}

Hnsw(int dim, const std::string &path_to_index)
: dim(dim), normalize(false), cur_l(0), numThreads(0), grainSize(1),
space(std::unique_ptr<Distance>(new Distance(dim))),
Expand Down Expand Up @@ -521,15 +527,20 @@ class Hnsw {
};

using HnswL2 = Hnsw<float, hnswlib::L2Space, false, NoDistanceProcess>;
using HnswCosine = Hnsw<float, hnswlib::InnerProductSpace, true, NoDistanceProcess>;
using HnswIp = Hnsw<float, hnswlib::InnerProductSpace, false, NoDistanceProcess>;
using HnswEuclidean = Hnsw<float, hnswlib::L2Space, false, SquareRootDistanceProcess>;
using HnswCosine =
Hnsw<float, hnswlib::InnerProductSpace, true, NoDistanceProcess>;
using HnswIp =
Hnsw<float, hnswlib::InnerProductSpace, false, NoDistanceProcess>;
using HnswEuclidean =
Hnsw<float, hnswlib::L2Space, false, SquareRootDistanceProcess>;

RCPP_EXPOSED_CLASS_NODECL(HnswL2)
RCPP_MODULE(HnswL2) {
Rcpp::class_<HnswL2>("HnswL2")
.constructor<int32_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef")
.constructor<int32_t, std::size_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef, random seed")
.constructor<int32_t, std::string>(
"constructor with dimension, loading from filename")
.constructor<int32_t, std::string, std::size_t>(
Expand Down Expand Up @@ -580,6 +591,8 @@ RCPP_MODULE(HnswCosine) {
Rcpp::class_<HnswCosine>("HnswCosine")
.constructor<int32_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef")
.constructor<int32_t, std::size_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef, random seed")
.constructor<int32_t, std::string>(
"constructor with dimension, loading from filename")
.constructor<int32_t, std::string, std::size_t>(
Expand Down Expand Up @@ -630,6 +643,8 @@ RCPP_MODULE(HnswIp) {
Rcpp::class_<HnswIp>("HnswIp")
.constructor<int32_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef")
.constructor<int32_t, std::size_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef, random seed")
.constructor<int32_t, std::string>(
"constructor with dimension, loading from filename")
.constructor<int32_t, std::string, std::size_t>(
Expand Down Expand Up @@ -680,6 +695,8 @@ RCPP_MODULE(HnswEuclidean) {
Rcpp::class_<HnswEuclidean>("HnswEuclidean")
.constructor<int32_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef")
.constructor<int32_t, std::size_t, std::size_t, std::size_t, std::size_t>(
"constructor with dimension, number of items, M, ef, random seed")
.constructor<int32_t, std::string>(
"constructor with dimension, loading from filename")
.constructor<int32_t, std::string, std::size_t>(
Expand Down Expand Up @@ -714,7 +731,8 @@ RCPP_MODULE(HnswEuclidean) {
"retrieve Nearest Neigbours given matrix where items are stored "
"column-wise. Nearest Neighbors data is also returned "
"column-wise")
.method("size", &HnswEuclidean::size, "number of items added to the index")
.method("size", &HnswEuclidean::size,
"number of items added to the index")
.method("setNumThreads", &HnswEuclidean::setNumThreads,
"set the number of threads to use")
.method("setGrainSize", &HnswEuclidean::setGrainSize,
Expand Down
4 changes: 4 additions & 0 deletions tests/testthat/test_verbose.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@ set.seed(1337)
data <- matrix(rnorm(100 * 10), nrow = 100)

# results should be the same
# 23: also counts as a test of setting the seed to get repeatable results
set.seed(1337)
res_p <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = TRUE, progress = "bar")
set.seed(1337)
res_v <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = TRUE, progress = NULL)
set.seed(1337)
res_q <- hnsw_knn(data, k = 10, distance = "euclidean", verbose = FALSE)
expect_equal(sum(res_p$idx - res_v$idx), 0)
expect_equal(sum(res_p$idx - res_q$idx), 0)

0 comments on commit 6c54753

Please sign in to comment.