Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Loadindex #20

Merged
merged 3 commits into from
Mar 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Rcpp bindings for [hnswlib](https://github.com/nmslib/hnswlib).

## Status

*February 4 2023* RcppHNSW 0.6.0 is released to CRAN, supporting
*February 4 2024* RcppHNSW 0.6.0 is released to CRAN, supporting
[hnswlib version 0.8.0](https://github.com/nmslib/hnswlib/releases/tag/v0.8.0).

*September 19 2023* RcppHNSW 0.5.0 is released to CRAN, supporting
Expand Down Expand Up @@ -123,6 +123,11 @@ iris_nn <- hnsw_search(irism[101:150, ], ann, k = 5)

## Class Example

As noted in the "Do not use named parameters" section below, you should avoid
using named parameters when using class methods. But I do use them in a few
places below to document the name of the parameters the positional arguments
refer to.

```R
library(RcppHNSW)
data <- as.matrix(iris[, -5])
Expand Down Expand Up @@ -153,7 +158,7 @@ res <- ann$getNNsList(data[1, ], k = 4, include_distances = TRUE)
ann2 <- new(HnswL2, dim, nitems, M, ef)
ann2$addItems(data)
# Retrieve the 4 nearest neighbors for every item in data
res2 <- ann2$getAllNNsList(data, k = 4, include_distances = TRUE)
res2 <- ann2$getAllNNsList(data, 4, TRUE)
# labels of the data are in res$item, distances in res$distance

# If you are able to store your data column-wise, then the overhead of copying
Expand All @@ -162,10 +167,18 @@ data_by_col <- t(data)
ann3 <- new(HnswL2, dim, nitems, M, ef)
ann3$addItemsCol(data_by_col)
# Retrieve the 4 nearest neighbors for every item in data_by_col
res3 <- ann3$getAllNNsListCol(data_by_col, k = 4, include_distances = TRUE)
res3 <- ann3$getAllNNsListCol(data_by_col, 4, TRUE)
# The returned neared neighbor data matrices are also returned column-wise
all(res2$item == t(res3$item) & res2$distance == t(res3$distance))

# Save the index
ann$save("iris.hnsw")

# load it back in: you do need to know the dimension of the original data
ann4 <- new(HnswL2, dim, "iris.hnsw")
# new index should behave like the original
all(ann$getNNs(data[1, ], 4) == ann4$getNNs(data[1, ], 4))

# other distance classes:
# Cosine: HnswCosine
# Inner Product: HnswIP
Expand Down
145 changes: 0 additions & 145 deletions inst/include/RcppPerpendicular/RcppPerpendicular.h

This file was deleted.

97 changes: 97 additions & 0 deletions inst/include/pforr/pforr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Taken from RcppParallel.h and then modified slightly to rename header guards
// and namespaces to avoid any potential clashes. RcppParallel is licensed under
// GPLv2 or later:

// pfor.h a version of parallel for based on RcppParallel
// Copyright (C) 2020 James Melville
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
// USA.

#ifndef PFORR
#define PFORR

#include <thread>
#include <utility>
#include <vector>

namespace pforr {

using IndexRange = std::pair<std::size_t, std::size_t>;

template <typename Worker>
auto worker_thread(Worker &worker, const IndexRange &range) -> void {
try {
worker(range.first, range.second);
} catch (...) {
}
}

// Function to calculate the ranges for a given input
inline auto split_input_range(const IndexRange &range, std::size_t n_threads,
std::size_t grain_size)
-> std::vector<IndexRange> {

// compute grain_size (including enforcing requested minimum)
std::size_t length = range.second - range.first;
if (n_threads == 1)
grain_size = length;
else if ((length % n_threads) == 0) // perfect division
grain_size = (std::max)(length / n_threads, grain_size);
else // imperfect division, divide by threads - 1
grain_size = (std::max)(length / (n_threads - 1), grain_size);

// allocate ranges
std::vector<IndexRange> ranges;
std::size_t begin = range.first;
while (begin < range.second) {
std::size_t end = (std::min)(begin + grain_size, range.second);
ranges.emplace_back(begin, end);
begin = end;
}

return ranges;
}

// Execute the Worker over the IndexRange in parallel
template <typename Worker>
inline void parallel_for(std::size_t begin, std::size_t end, Worker &worker,
std::size_t n_threads, std::size_t grain_size = 1) {
if (n_threads == 0) {
worker(begin, end);
return;
}
// split the work
IndexRange input_range(begin, end);
std::vector<IndexRange> ranges =
split_input_range(input_range, n_threads, grain_size);

std::vector<std::thread> threads;
threads.reserve(ranges.size());
for (auto &range : ranges) {
threads.push_back(
std::thread(&worker_thread<Worker>, std::ref(worker), range));
}

for (auto &thread : threads) {
thread.join();
}

return;
}

} // namespace pforr

#endif // PFORR
12 changes: 6 additions & 6 deletions src/hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

#include "rcpphnsw.h"

#include "RcppPerpendicular/RcppPerpendicular.h"
#include "pforr/pforr.h"

template <typename dist_t, bool DoNormalize = false> struct Normalizer {
static void normalize(std::vector<dist_t> &vec) {}
Expand Down Expand Up @@ -130,7 +130,7 @@ class Hnsw {
addItemImpl(item_copy, index_start + i);
}
};
RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);
cur_l = size();
}

Expand Down Expand Up @@ -158,7 +158,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);
cur_l = size();
}

Expand Down Expand Up @@ -295,7 +295,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return found_all;
}
Expand Down Expand Up @@ -428,7 +428,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return found_all;
}
Expand All @@ -446,7 +446,7 @@ class Hnsw {
}
};

RcppPerpendicular::parallel_for(nitems, worker, numThreads);
pforr::parallel_for(0, nitems, worker, numThreads);

return data;
}
Expand Down
47 changes: 47 additions & 0 deletions tests/testthat/test_save_load.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
library(RcppHNSW)
context("Save/load index")

num_elements <- nrow(uirism)
dim <- ncol(uirism)

M <- 16
ef_construction <- 10
p <- new(HnswL2, dim, num_elements, M, ef_construction)

for (i in 1:num_elements) {
p$addItem(uirism[i, ])
}

nn4idx <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist <- matrix(0.0, nrow = num_elements, ncol = 4)

for (i in 1:num_elements) {
res <- p$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx[i, ] <- res$item
nn4dist[i, ] <- res$distance
}

temp_file <- tempfile()
on.exit(unlink(temp_file), add = TRUE)
p$save(temp_file)

nn4idx_aftersave <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist_aftersave <- matrix(0.0, nrow = num_elements, ncol = 4)
for (i in 1:num_elements) {
res_aftersave <- p$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx_aftersave[i, ] <- res_aftersave$item
nn4dist_aftersave[i, ] <- res_aftersave$distance
}
expect_equal(nn4idx, nn4idx_aftersave)
expect_equal(nn4dist, nn4dist_aftersave)

pload <- new(HnswL2, dim, temp_file)
nn4idx_afterload <- matrix(0L, nrow = num_elements, ncol = 4)
nn4dist_afterload <- matrix(0.0, nrow = num_elements, ncol = 4)
for (i in 1:num_elements) {
res_afterload <- pload$getNNsList(uirism[i, ], k = 4, TRUE)
nn4idx_afterload[i, ] <- res_afterload$item
nn4dist_afterload[i, ] <- res_afterload$distance
}
expect_equal(nn4idx, nn4idx_afterload)
expect_equal(nn4dist, nn4dist_afterload)
Loading