Skip to content

Commit

Permalink
Merge pull request #20 from jlmelville/loadindex
Browse files Browse the repository at this point in the history
Loadindex
  • Loading branch information
jlmelville authored Mar 10, 2024
2 parents c18b3a2 + 62862e1 commit 2b25140
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 154 deletions.
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Rcpp bindings for [hnswlib](https://github.com/nmslib/hnswlib).

## Status

*February 4 2023* RcppHNSW 0.6.0 is released to CRAN, supporting
*February 4 2024* RcppHNSW 0.6.0 is released to CRAN, supporting
[hnswlib version 0.8.0](https://github.com/nmslib/hnswlib/releases/tag/v0.8.0).

*September 19 2023* RcppHNSW 0.5.0 is released to CRAN, supporting
Expand Down Expand Up @@ -123,6 +123,11 @@ iris_nn <- hnsw_search(irism[101:150, ], ann, k = 5)

## Class Example

As noted in the "Do not use named parameters" section below, you should avoid
using named parameters when using class methods. But I do use them in a few
places below to document the name of the parameters the positional arguments
refer to.

```R
library(RcppHNSW)
data <- as.matrix(iris[, -5])
Expand Down Expand Up @@ -153,7 +158,7 @@ res <- ann$getNNsList(data[1, ], k = 4, include_distances = TRUE)
ann2 <- new(HnswL2, dim, nitems, M, ef)
ann2$addItems(data)
# Retrieve the 4 nearest neighbors for every item in data
res2 <- ann2$getAllNNsList(data, k = 4, include_distances = TRUE)
res2 <- ann2$getAllNNsList(data, 4, TRUE)
# labels of the data are in res$item, distances in res$distance

# If you are able to store your data column-wise, then the overhead of copying
Expand All @@ -162,10 +167,18 @@ data_by_col <- t(data)
ann3 <- new(HnswL2, dim, nitems, M, ef)
ann3$addItemsCol(data_by_col)
# Retrieve the 4 nearest neighbors for every item in data_by_col
res3 <- ann3$getAllNNsListCol(data_by_col, k = 4, include_distances = TRUE)
res3 <- ann3$getAllNNsListCol(data_by_col, 4, TRUE)
# The returned nearest neighbor data matrices are also returned column-wise
all(res2$item == t(res3$item) & res2$distance == t(res3$distance))

# Save the index
ann$save("iris.hnsw")

# load it back in: you do need to know the dimension of the original data
ann4 <- new(HnswL2, dim, "iris.hnsw")
# new index should behave like the original
all(ann$getNNs(data[1, ], 4) == ann4$getNNs(data[1, ], 4))

# other distance classes:
# Cosine: HnswCosine
# Inner Product: HnswIP
Expand Down
145 changes: 0 additions & 145 deletions inst/include/RcppPerpendicular/RcppPerpendicular.h

This file was deleted.

97 changes: 97 additions & 0 deletions inst/include/pforr/pforr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Taken from RcppParallel.h and then modified slightly to rename header guards
// and namespaces to avoid any potential clashes. RcppParallel is licensed under
// GPLv2 or later:

// pforr.h: a version of parallel for based on RcppParallel
// Copyright (C) 2020 James Melville
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
// USA.

#ifndef PFORR
#define PFORR

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <utility>
#include <vector>

namespace pforr {

// Half-open index interval [first, second) handed to one worker invocation.
using IndexRange = std::pair<std::size_t, std::size_t>;

// Thread entry point: runs worker(range.first, range.second) for one chunk.
//
// An exception must never escape a std::thread entry function (that would
// call std::terminate), so anything the worker throws is deliberately
// discarded here. Workers that need to report failure have to record it in
// their own state.
template <typename Worker>
auto worker_thread(Worker &worker, const IndexRange &range) -> void {
  try {
    worker(range.first, range.second);
  } catch (...) {
    // intentionally swallowed, see the comment above
  }
}

// Split `range` into consecutive, non-overlapping chunks that together cover
// it exactly.
//
// range      - half-open interval [first, second) to split; an empty or
//              inverted interval yields no chunks.
// n_threads  - number of threads the work is intended for; 0 is treated like
//              1 (a single chunk) instead of dividing by zero.
// grain_size - requested minimum chunk size; 0 is treated like 1 so that the
//              splitting loop always makes progress.
//
// Returns the chunks in ascending order. The final chunk may be shorter than
// the others.
inline auto split_input_range(const IndexRange &range, std::size_t n_threads,
                              std::size_t grain_size)
    -> std::vector<IndexRange> {
  std::vector<IndexRange> ranges;
  if (range.second <= range.first) {
    return ranges;
  }

  // compute grain_size (including enforcing requested minimum)
  const std::size_t length = range.second - range.first;
  if (n_threads <= 1) {
    grain_size = length;
  } else if ((length % n_threads) == 0) { // perfect division
    grain_size = (std::max)(length / n_threads, grain_size);
  } else { // imperfect division, divide by threads - 1
    grain_size = (std::max)(length / (n_threads - 1), grain_size);
  }
  // length / (n_threads - 1) is 0 when there are more threads than items; a
  // zero grain would make the loop below spin forever.
  grain_size = (std::max)(grain_size, std::size_t{1});

  // allocate ranges
  std::size_t begin = range.first;
  while (begin < range.second) {
    // Compare against the remaining length rather than computing
    // begin + grain_size first, which could wrap for a huge grain_size.
    const std::size_t remaining = range.second - begin;
    const std::size_t end =
        remaining > grain_size ? begin + grain_size : range.second;
    ranges.emplace_back(begin, end);
    begin = end;
  }

  return ranges;
}

// Execute the Worker over [begin, end) in parallel.
//
// worker     - callable invoked as worker(chunk_begin, chunk_end); it is
//              shared by reference between all threads. Exceptions thrown by
//              the worker inside a spawned thread are discarded (see
//              worker_thread).
// n_threads  - 0 runs worker(begin, end) directly on the calling thread;
//              otherwise the range is split with split_input_range and one
//              thread is started per chunk.
// grain_size - minimum chunk size forwarded to split_input_range.
//
// Blocks until every started thread has finished. If starting a thread fails,
// the threads already running are joined before the exception is rethrown, so
// no joinable std::thread is ever destroyed.
template <typename Worker>
inline void parallel_for(std::size_t begin, std::size_t end, Worker &worker,
                         std::size_t n_threads, std::size_t grain_size = 1) {
  if (n_threads == 0) {
    worker(begin, end);
    return;
  }

  // split the work
  const std::vector<IndexRange> ranges =
      split_input_range(IndexRange(begin, end), n_threads, grain_size);

  std::vector<std::thread> threads;
  threads.reserve(ranges.size());
  try {
    for (const auto &range : ranges) {
      threads.emplace_back(&worker_thread<Worker>, std::ref(worker), range);
    }
  } catch (...) {
    for (auto &thread : threads) {
      thread.join();
    }
    throw;
  }

  for (auto &thread : threads) {
    thread.join();
  }
}

} // namespace pforr

#endif // PFORR
12 changes: 6 additions & 6 deletions src/hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

#include "rcpphnsw.h"

#include "RcppPerpendicular/RcppPerpendicular.h"
#include "pforr/pforr.h"

template <typename dist_t, bool DoNormalize = false> struct Normalizer {
static void normalize(std::vector<dist_t> &vec) {}
Expand Down Expand Up @@ -130,7 +130,7 @@ class Hnsw {
addItemImpl(item_copy, index_start + i);
}
};
RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);
cur_l = size();
}

Expand Down Expand Up @@ -158,7 +158,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);
cur_l = size();
}

Expand Down Expand Up @@ -295,7 +295,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return found_all;
}
Expand Down Expand Up @@ -428,7 +428,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return found_all;
}
Expand All @@ -446,7 +446,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return data;
}
Expand Down
47 changes: 47 additions & 0 deletions tests/testthat/test_save_load.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Round-trip test for saving an index to disk and loading it back: neighbor
# queries must give the same items and distances before the save, after the
# save (on the original index), and on the index re-created from the file.
library(RcppHNSW)
context("Save/load index")

# NOTE(review): `uirism` is not defined in this file; presumably it is a
# numeric matrix fixture provided by a testthat helper -- confirm.
num_elements <- nrow(uirism)
dim <- ncol(uirism)

# Construction parameters passed straight through to the HnswL2 constructor.
M <- 16
ef_construction <- 10
p <- new(HnswL2, dim, num_elements, M, ef_construction)

# Add the rows of uirism to the index one at a time.
for (i in 1:num_elements) {
p$addItem(uirism[i, ])
}

# Reference results: the 4 neighbors (items and distances) of every row,
# queried before the index is saved.
nn4idx <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist <- matrix(0.0, nrow = num_elements, ncol = 4)

for (i in 1:num_elements) {
res <- p$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx[i, ] <- res$item
nn4dist[i, ] <- res$distance
}

# Write the index to a temporary file; the on.exit handler is registered to
# remove that file again.
temp_file <- tempfile()
on.exit(unlink(temp_file), add = TRUE)
p$save(temp_file)

# Saving must not change the original index: repeat the same queries on `p`
# and compare against the reference results.
nn4idx_aftersave <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist_aftersave <- matrix(0.0, nrow = num_elements, ncol = 4)
for (i in 1:num_elements) {
res_aftersave <- p$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx_aftersave[i, ] <- res_aftersave$item
nn4dist_aftersave[i, ] <- res_aftersave$distance
}
expect_equal(nn4idx, nn4idx_aftersave)
expect_equal(nn4dist, nn4dist_aftersave)

# Re-create the index from the saved file (only the dimension and the file
# name are passed) and check it reproduces the reference results.
pload <- new(HnswL2, dim, temp_file)
nn4idx_afterload <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist_afterload <- matrix(0.0, nrow = num_elements, ncol = 4)
for (i in 1:num_elements) {
res_afterload <- pload$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx_afterload[i, ] <- res_afterload$item
nn4dist_afterload[i, ] <- res_afterload$distance
}
expect_equal(nn4idx, nn4idx_afterload)
expect_equal(nn4dist, nn4dist_afterload)

0 comments on commit 2b25140

Please sign in to comment.