diff --git a/CMakeLists.txt b/CMakeLists.txt index c292a59a..b1cde265 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,5 +62,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_executable(api_tests tests/cpp/api_test.cpp) target_link_libraries(api_tests hnswlib) -endif() + add_executable(hnsw-fsck hnsw-fsck.cpp) + target_link_libraries(hnsw-fsck hnswlib) +endif()
diff --git a/hnsw-fsck.cpp b/hnsw-fsck.cpp
new file mode 100644
index 00000000..305a1fba
--- /dev/null
+++ b/hnsw-fsck.cpp
@@ -0,0 +1,68 @@
+// hnsw-fsck: load a persisted hnswlib index and run its integrity check.
+//
+// Usage: hnsw-fsck <index_path> <space_name> <dim>
+//   space_name is one of "l2", "ip" or "cosine"; dim is a positive integer.
+//
+// Exit status: 0 if the check passes, 1 on bad arguments, 2 on an unknown
+// space name, 3 if loading or checking the index throws.
+#include <cerrno>
+#include <cstddef>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "hnswlib/hnswlib.h"
+
+int main(int argc, const char* argv[]) {
+    if (argc != 4) {
+        std::cerr << "USAGE: hnsw-fsck <index_path> <space_name> <dim>\n";
+        return 1;
+    }
+    const std::string index_path(argv[1]);
+    const std::string space_name(argv[2]);
+
+    // atoi() returns 0 for garbage and is undefined on overflow, so parse
+    // strictly: the whole argument must be a positive decimal integer.
+    char* dim_end = nullptr;
+    errno = 0;
+    const long parsed_dim = std::strtol(argv[3], &dim_end, 10);
+    const bool dim_ok = dim_end != argv[3] && *dim_end == '\0' &&
+                        errno != ERANGE && parsed_dim > 0;
+    if (!dim_ok) {
+        std::cerr << "Invalid dimension: " << argv[3] << '\n';
+        return 1;
+    }
+    const std::size_t dim = static_cast<std::size_t>(parsed_dim);
+
+    // Owned here so the space is freed on every exit path. Declared before
+    // the index so that it outlives the index that borrows it.
+    std::unique_ptr<hnswlib::SpaceInterface<float>> space;
+    bool normalize = false;
+    if (space_name == "l2") {
+        space.reset(new hnswlib::L2Space(dim));
+    } else if (space_name == "ip") {
+        // For IP, we expect the vectors to be normalized
+        space.reset(new hnswlib::InnerProductSpace(dim));
+    } else if (space_name == "cosine") {
+        space.reset(new hnswlib::InnerProductSpace(dim));
+        normalize = true;
+    } else {
+        std::cerr << "Unknown space name: " << space_name << std::endl;
+        return 2;
+    }
+
+    try {
+        hnswlib::HierarchicalNSW<float> index(
+            space.get(), index_path, false, 0, false /*allow_replace_deleted*/,
+            normalize, true /*is_persistent_index*/);
+        index.checkIntegrity();
+    } catch (const std::exception& e) {
+        // checkIntegrity() signals corruption by throwing std::runtime_error;
+        // report it and exit non-zero instead of aborting on an uncaught throw.
+        std::cerr << "hnsw-fsck: " << e.what() << '\n';
+        return 3;
+    }
+    return 0;
+}
diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index e4e8150f..684d6752 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace hnswlib { @@ -1835,8 +1836,10 @@ namespace hnswlib std::unordered_set s; for (int j = 0; j < size; j++) { - if (data[j] < 0 || data[j] >= cur_element_count || data[j] == i) - throw std::runtime_error("HNSW Integrity failure: invalid neighbor index"); + if (data[j] >= cur_element_count) 
+ throw std::runtime_error("HNSW Integrity failure: invalid neighbor index data[j] >= cur_element_count"); + if (data[j] == i) + throw std::runtime_error("HNSW Integrity failure: invalid neighbor index data[j] == i"); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; @@ -1850,8 +1853,14 @@ namespace hnswlib int min1 = inbound_connections_num[0], max1 = inbound_connections_num[0]; for (int i = 0; i < cur_element_count; i++) { + /* // This should always be true regardless the data is corrupted or not - assert(inbound_connections_num[i] > 0); + if (inbound_connections_num[i] <= 0) { + std::ostringstream ostr; + ostr << "HNSW Integrity failure: inbound_connections_num[" << i << "] = " << inbound_connections_num[i] << " <= 0"; + throw std::runtime_error(ostr.str()); + } + */ min1 = std::min(inbound_connections_num[i], min1); max1 = std::max(inbound_connections_num[i], max1); } diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index 961dfefd..8544b6cf 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -250,6 +250,7 @@ int main(int argc, char **argv) // Adding enterpoint: appr_alg.addPoint((void *)dummy_batch.data(), (size_t)0); + appr_alg.checkIntegrity(); StopW stopw = StopW();