Skip to content

Commit

Permalink
Fix load bugs/messages, update test, deprecate old indices (#148)
Browse files Browse the repository at this point in the history
* temp debug state

* fix bug in loading index with deleted elements

* adjust condition in test

* add check for file existence

* cleanup
  • Loading branch information
yurymalkov authored Sep 16, 2019
1 parent b3671c5 commit c5c38f0
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 18 deletions.
37 changes: 23 additions & 14 deletions hnswlib/hnswalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,10 @@ namespace hnswlib {

std::ifstream input(location, std::ios::binary);

if (!input.is_open())
throw std::runtime_error("Cannot open file");


// get file size:
input.seekg(0,input.end);
std::streampos total_filesize=input.tellg();
Expand Down Expand Up @@ -625,16 +629,15 @@ namespace hnswlib {
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();

/// Legacy, check that everything is ok

bool old_index=false;

auto pos=input.tellg();


/// Optional - check if index is ok:

input.seekg(cur_element_count * size_data_per_element_,input.cur);
for (size_t i = 0; i < cur_element_count; i++) {
if(input.tellg() < 0 || input.tellg()>=total_filesize){
old_index = true;
break;
throw std::runtime_error("Index seems to be corrupted or unsupported");
}

unsigned int linkListSize;
Expand All @@ -644,23 +647,21 @@ namespace hnswlib {
}
}

// check if the file is ok; if not, it is either corrupted or an old index
// throw an exception if it is either corrupted or an old index
if(input.tellg()!=total_filesize)
old_index = true;
throw std::runtime_error("Index seems to be corrupted or unsupported");

if (old_index) {
std::cerr << "Warning: loading of old indexes will be deprecated before 2019.\n"
<< "Please resave the index in the new format.\n";
}
input.clear();

/// Optional check end

input.seekg(pos,input.beg);


data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_);
input.read(data_level0_memory_, cur_element_count * size_data_per_element_);

if(old_index)
input.seekg(((max_elements_-cur_element_count) * size_data_per_element_), input.cur);



size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
Expand Down Expand Up @@ -691,6 +692,14 @@ namespace hnswlib {
input.read(linkLists_[i], linkListSize);
}
}

has_deletions_=false;

for (size_t i = 0; i < cur_element_count; i++) {
if(isMarkedDeleted(i))
has_deletions_=true;
}

input.close();

return;
Expand Down
20 changes: 17 additions & 3 deletions python_bindings/tests/bindings_test_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")
import hnswlib
import numpy as np


np.random.seed(idx)
dim = 16
num_elements = 10000

Expand Down Expand Up @@ -95,8 +97,8 @@ def testRandomSelf(self):
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels=np.max(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # console
diff_with_gt_labels=np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console


labels1_after, _ = p.knn_query(data1, k=1)
Expand All @@ -106,6 +108,18 @@ def testRandomSelf(self):
self.assertTrue(False)
print("All the data in data1 are removed")

# checking saving/loading index with elements marked as deleted
p.save_index("with_deleted.bin")
p = hnswlib.Index(space='l2', dim=dim)
p.load_index("with_deleted.bin")
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)



if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion python_bindings/tests/bindings_test_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(32):
for idx in range(16):
print("\n**** Index resize test ****\n")
import hnswlib
import numpy as np
Expand Down

0 comments on commit c5c38f0

Please sign in to comment.