
Commit

feat(indexer): add indexer info
jemmyshin committed Sep 10, 2019
1 parent 0ee6961 commit b2b2cc9
Showing 5 changed files with 48 additions and 7 deletions.
46 changes: 39 additions & 7 deletions gnes/indexer/key_only.py
@@ -25,18 +25,32 @@ class DictKeyIndexer(BaseKeyIndexer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._key_info = {}
+        self._all_docs = []
 
     def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs) -> int:
         for (k, o), w in zip(keys, weights):
             self._key_info[k] = o, w
+        self.update_counter(keys)
         return len(self._key_info)
 
     def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float]]:
         return [(k, *self._key_info[k]) for k in keys]
 
+    def update_counter(self, keys: List[Tuple[int, int]]):
+        self._num_doc = len(self._key_info)
+        self._num_chunks += len(keys)
+
     @property
     def num_chunks(self):
-        return len(self._key_info)
+        return self._num_chunks
+
+    @property
+    def num_doc(self):
+        return self._num_doc
+
+    @property
+    def num_chunks_avg(self):
+        return self._num_doc / len(self._key_info)
 
 
 class ListKeyIndexer(BaseKeyIndexer):
@@ -63,20 +77,19 @@ def update_counter(self, keys: List[Tuple[int, int]]):
 
     def _update_docs(self, keys: List[Tuple[int, int]]):
         for key in keys:
-            if key[0] not in self._all_docs:
-                self._all_docs.append(key[0])
+            self._all_docs.append(key[0])
 
     @property
     def num_chunks(self):
-        return len(self._int2key)
+        return self._num_chunks
 
     @property
     def num_doc(self):
-        return len(self._all_docs)
+        return len(set(self._all_docs))
 
     @property
     def num_chunks_avg(self):
-        return len(self._int2key) / len(self._all_docs)
+        return self._num_chunks / len(set(self._all_docs))
 
     # @property
     # def num_chunk_per_doc(self):
@@ -96,6 +109,7 @@ def __init__(self, *args, **kwargs):
         self._data_updated = False
         self._np_int2key = None
         self._np_int2key_weight = None
+        self._all_docs = []
 
     def _build_np_buffer(self):
         if self._data_updated or not self._np_int2key or not self._np_int2key_weight:
@@ -127,6 +141,7 @@ def __init__(self, buffer_size: int = 10000, col_size: int = 3, *args, **kwargs):
         self._col_size = col_size
         self._size = 0
         self._max_size = self._buffer_size
+        self._all_docs = []
 
     def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs) -> int:
         l = len(keys)
@@ -138,16 +153,33 @@ def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs) -> int:
         self._int2key_info[self._size:(self._size + l), 0:(self._col_size - 1)] = np.array(keys)
         self._int2key_info[self._size:(self._size + l), self._col_size - 1] = np.array(weights)
         self._size += l
+        self.update_counter(keys)
         return self._size
 
     def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float]]:
         key_offset = self._int2key_info[keys, 0:(self._col_size - 1)].astype(int).tolist()
         weights = self._int2key_info[keys, self._col_size - 1].astype(float).tolist()
         return [(*ko, w) for ko, w in zip(key_offset, weights)]
 
+    def update_counter(self, keys: List[Tuple[int, int]]):
+        self._update_docs(keys)
+        self._num_chunks += len(keys)
+
+    def _update_docs(self, keys: List[Tuple[int, int]]):
+        for key in keys:
+            self._all_docs.append(key[0])
+
     @property
     def num_chunks(self):
-        return self._size
+        return self._num_chunks
+
+    @property
+    def num_doc(self):
+        return len(set(self._all_docs))
+
+    @property
+    def num_chunks_avg(self):
+        return self._num_chunks / len(set(self._all_docs))
 
     @property
     def capacity(self):
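Taken together, the key_only.py hunks give every key indexer the same accounting: update_counter() bumps _num_chunks by the number of keys added, and each key's doc id is appended to _all_docs, with duplicates dropped only when num_doc is read. Below is a minimal, self-contained sketch of that scheme, mirroring the ListKeyIndexer variant of the diff; the CountingIndexer name, the _num_chunks = 0 initialization (the diff only ever increments it, so base-class init is assumed), and the demo values are illustrative, not part of this commit.

from typing import List, Tuple


class CountingIndexer:
    """Minimal sketch of the doc/chunk accounting this commit adds."""

    def __init__(self):
        self._num_chunks = 0  # assumed here; the diff only increments it
        self._all_docs = []   # doc id of every added chunk, duplicates kept

    def update_counter(self, keys: List[Tuple[int, int]]):
        # each key is a (doc_id, offset) pair, i.e. one chunk
        for doc_id, _ in keys:
            self._all_docs.append(doc_id)
        self._num_chunks += len(keys)

    @property
    def num_chunks(self) -> int:
        return self._num_chunks

    @property
    def num_doc(self) -> int:
        # duplicates are removed only at read time, exactly as in the diff
        return len(set(self._all_docs))

    @property
    def num_chunks_avg(self) -> float:
        return self._num_chunks / len(set(self._all_docs))


indexer = CountingIndexer()
indexer.update_counter([(0, 0), (0, 1), (1, 0)])  # doc 0: 2 chunks, doc 1: 1
assert indexer.num_chunks == 3
assert indexer.num_doc == 2
assert indexer.num_chunks_avg == 1.5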
2 changes: 2 additions & 0 deletions tests/test_annoyindexer.py
@@ -21,6 +21,8 @@ def test_search(self):
         a = AnnoyIndexer(5, self.dump_path)
         a.add(list(zip(list(range(10)), list(range(10)))), self.toy_data, [1.] * 10)
         self.assertEqual(a.num_chunks, 10)
+        self.assertEqual(a.num_doc, 10)
+        self.assertEqual(a.num_chunks_avg, 1)
         top_1 = [i[0][0] for i in a.query(self.toy_data, top_k=1)]
         self.assertEqual(top_1, list(range(10)))
         a.close()
3 changes: 3 additions & 0 deletions tests/test_bindexer.py
@@ -36,6 +36,9 @@ def tearDown(self):
     def test_nsw_search(self):
         fd = BIndexer(self.toy_data.shape[1], data_path=self.dump_path + '_1')
         fd.add(self.toy_label, self.toy_data, self.weights)
+        self.assertEqual(fd.num_doc, 7)
+        self.assertEqual(fd.num_chunks, 7)
+        self.assertEqual(fd.num_chunks_avg, 1)
 
         rs = fd.query(self.toy_query, 2, method='nsw', normalized_score=False)
         for i in range(len(rs)):
1 change: 1 addition & 0 deletions tests/test_dict_indexer.py
@@ -87,6 +87,7 @@ def init_db(self):
         preprocess.apply(self.d)
 
         self.db.add(list(range(len(self.video_bytes))), [self.d])
+        self.assertEqual(self.db.num_doc, len(self.video_bytes))
 
     def test_add_docs(self):
         # self.init_db()
3 changes: 3 additions & 0 deletions tests/test_simple_indexer.py
@@ -58,6 +58,9 @@ def test_bench_numpy_list(self):
         with TimeContext('%s:add()' % cls.__name__):
             for k, w in zip(batch_iterator(self.key_offset, b_size), batch_iterator(self.weights, b_size)):
                 a.add(k, w)
+        self.assertEqual(a.num_doc, 1000000)
+        self.assertEqual(a.num_chunks, 1000000)
+        self.assertEqual(a.num_chunks_avg, 1)
 
         with TimeContext('%s:query()' % cls.__name__):
             for k in batch_iterator(self.query, b_size):
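Where the new tests assert both counters (test_annoyindexer, test_bindexer, test_simple_indexer), every document contributes exactly one chunk, so num_chunks equals num_doc and num_chunks_avg is 1. A toy calculation (illustrative keys, not taken from the test fixtures) shows how the average departs from 1 once documents span several chunks:

# Each (doc_id, offset) pair is one chunk; doc ids repeat across chunks.
keys = [(0, 0), (0, 1), (0, 2), (1, 0)]        # 4 chunks across 2 docs
num_chunks = len(keys)                         # 4
num_doc = len({doc_id for doc_id, _ in keys})  # 2
assert num_chunks / num_doc == 2.0             # num_chunks_avg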
