This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Merge pull request #55 from gnes-ai/normalize_weight
fix(indexer): normalize weight
Larryjianfeng authored Jul 25, 2019
2 parents 86f4527 + 689abe3 commit 50269c7
Showing 7 changed files with 48 additions and 16 deletions.
12 changes: 12 additions & 0 deletions gnes/indexer/base.py
@@ -31,6 +31,9 @@ def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs):
     def query(self, keys: Any, *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseVectorIndexer(BaseIndexer):
 
@@ -40,6 +43,9 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseTextIndexer(BaseIndexer):
 
@@ -49,6 +55,9 @@ def add(self, keys: List[int], docs: Any, weights: List[float], *args, **kwargs):
     def query(self, keys: List[int], *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseKeyIndexer(BaseIndexer):
 
@@ -58,6 +67,9 @@ def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs):
     def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class JointIndexer(CompositionalEncoder):
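The base classes now expose normalize_score() as a no-op hook, so every concrete indexer can map its raw distances onto a comparable relevance scale. A minimal sketch of how a subclass might override it (the class and mapping below are illustrative, not part of this commit):

    class ToyVectorIndexer(BaseVectorIndexer):
        def normalize_score(self, score, *args, **kwargs):
            # squash raw distances into (0, 1]: distance 0 -> score 1.0
            return [1 / (1 + s) for s in score]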
16 changes: 15 additions & 1 deletion gnes/indexer/vector/annoy.py
@@ -65,10 +65,24 @@ def query(self, keys: 'np.ndarray', top_k: int, *args, **kwargs) -> List[List[Tuple]]:
         res = []
         for k in keys:
             ret, relevance_score = self._index.get_nns_by_vector(k, top_k, include_distances=True)
+            relevance_score = self.normalize_score(relevance_score, self.metric)
             chunk_info = self._key_info_indexer.query(ret)
-            res.append([(*r, -s) for r, s in zip(chunk_info, relevance_score)])
+            res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)])
         return res
 
+    def normalize_score(self, score: List[float], metrics: str, *args) -> List[float]:
+        if metrics == 'angular':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'euclidean':
+            import math
+            return list(map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score))
+        elif metrics == 'manhattan':
+            return list(map(lambda x: 1 / (1 + x / self.num_dim), score))
+        elif metrics == 'hamming':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'dot':
+            pass
+
     @property
     def size(self):
         return self._index.get_n_items()
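The Annoy indexer now converts raw distances into scores per metric before returning them, so the score negation in query() is gone. Note that the 'dot' branch falls through and returns None, leaving dot-product results unnormalized. Plugging sample numbers into the new branches (num_dim = 128 is a hypothetical dimensionality):

    import math

    num_dim = 128                                 # hypothetical vector dimensionality
    print(1 / (1 + 0.0))                          # angular/hamming, d=0   -> 1.0
    print(1 / (1 + math.sqrt(256.0) / num_dim))   # euclidean,      d=256 -> 0.888...
    print(1 / (1 + 64.0 / num_dim))               # manhattan,      d=64  -> 0.666...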
14 changes: 6 additions & 8 deletions gnes/indexer/vector/bindexer/__init__.py
@@ -74,7 +74,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self,
               keys: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
               method: str = 'nsw',
               *args,
               **kwargs) -> List[List[Tuple]]:
@@ -93,17 +92,15 @@ def query(self,
         q_idx, doc_ids, offsets, weights = self.bindexer.find_batch_trie(
             keys, num_rows)
         for (i, q, o, w) in zip(doc_ids, q_idx, offsets, weights):
-            result[q].append((i, o, w / self._weight_norm, 1 if normalized_score else self.num_bytes))
+            result[q].append((i, o, w / self._weight_norm, 1))
 
         # search the indexed items with similar value
         doc_ids, offsets, weights, dists, q_idx = self.bindexer.nsw_search(
             keys, num_rows, top_k)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
             if d == 0:
                 continue
-            result[q].append(
-                (i, o, w / self._weight_norm,
-                 (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+            result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
 
         # get the top-k
         for q in range(num_rows):
@@ -112,11 +109,12 @@
         doc_ids, offsets, weights, dists, q_idx = self.bindexer.force_search(
             keys, num_rows, top_k)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-            result[q].append(
-                (i, o, w / self._weight_norm,
-                 (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+            result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
         return result
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / self.num_bytes
+
     def __getstate__(self):
         self.bindexer.save(self.data_path)
         d = super().__getstate__()
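With the normalized_score flag removed, the binary indexer always reports 1 - distance / num_bytes: exact trie matches score 1 and Hamming neighbours decay linearly. A worked example, assuming num_bytes = 4 as in tests/test_bindexer.py:

    num_bytes = 4
    for d in (0, 1, 2, 4):
        print(d, 1. - d / num_bytes)   # 0 -> 1.0, 1 -> 0.75, 2 -> 0.5, 4 -> 0.0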
9 changes: 8 additions & 1 deletion gnes/indexer/vector/faiss.py
@@ -61,16 +61,23 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
             raise ValueError("vectors should be ndarray of float32")
 
         score, ids = self._faiss_index.search(keys, top_k)
+        score = self.normalize_score(score)
         ret = []
         for _id, _score in zip(ids, score):
             ret_i = []
             chunk_info = self._key_info_indexer.query(_id)
             for c_info, _score_i in zip(chunk_info, _score):
-                ret_i.append((*c_info, -_score_i))
+                ret_i.append((*c_info, _score_i))
             ret.append(ret_i)
 
         return ret
 
+    def normalize_score(self, score: np.ndarray, *args) -> np.ndarray:
+        if 'HNSW' in self.index_key:
+            return 1 / (1 + np.sqrt(score) / self.num_dim)
+        elif 'PQ' in self.index_key or 'Flat' in self.index_key:
+            return 1 / (1 + np.abs(np.sqrt(score)))
+
     @property
     def size(self):
         return self._faiss_index.ntotal
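FAISS reports squared L2 distances under METRIC_L2, which is why both branches take a square root before squashing. A sketch of the Flat/PQ branch on made-up raw scores:

    import numpy as np

    raw = np.array([[0.0, 4.0, 25.0]])      # hypothetical squared-L2 distances
    print(1 / (1 + np.abs(np.sqrt(raw))))   # [[1.0, 0.333..., 0.166...]]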
7 changes: 4 additions & 3 deletions gnes/indexer/vector/hbindexer/__init__.py
@@ -72,7 +72,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self,
               vectors: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
               *args,
               **kwargs) -> List[List[Tuple]]:
 
@@ -89,11 +88,13 @@ def query(self,
         doc_ids, offsets, weights, dists, q_idx = self.hbindexer.query(
             vectors, clusters, n, top_k * self.n_idx)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-            result[q][(i, o, w / self._weight_norm)] = (
-                1. - d / self.n_bytes * 8) if normalized_score else self.n_bytes * 8 - d
+            result[q][(i, o, w / self._weight_norm)] = self.normalize_score(d)
 
         return [sorted(ret.items(), key=lambda x: -x[1])[:top_k] for ret in result]
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / (self.n_bytes * 8)
+
     def __getstate__(self):
         self.hbindexer.save(self.data_path)
         d = super().__getstate__()
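Here the distance is a Hamming count over n_bytes * 8 bits, so the score decays linearly from 1 at an exact match to 0 at the maximum distance. For example, with a hypothetical n_bytes = 16 (128 bits):

    n_bytes = 16
    for d in (0, 32, 128):
        print(d, 1. - d / (n_bytes * 8))   # 0 -> 1.0, 32 -> 0.75, 128 -> 0.0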
2 changes: 1 addition & 1 deletion gnes/service/indexer.py
@@ -66,7 +66,7 @@ def _handler_chunk_search(self, msg: 'gnes_pb2.Message'):
             r.chunk.doc_id = _doc_id
             r.chunk.offset_1d = _offset
             r.chunk.weight = _weight
-            r.score = _weight * qc_weight * (-1 / _relevance)
+            r.score = _weight * qc_weight * _relevance
             r.score_explained = '[chunk_score at doc: %d, offset: %d] = ' \
                                 '(doc_chunk_weight: %.6f) * ' \
                                 '(query_doc_chunk_relevance: %.6f) * ' \
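Because relevance now arrives pre-normalized into (0, 1], the chunk score is a plain product of bounded factors instead of the old -1 / _relevance inversion. With hypothetical values:

    _weight, qc_weight, _relevance = 0.8, 0.5, 0.75   # hypothetical inputs
    print(_weight * qc_weight * _relevance)           # 0.3; bounded when every factor is in (0, 1]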
4 changes: 2 additions & 2 deletions tests/test_bindexer.py
@@ -21,8 +21,8 @@ def setUp(self):
                                    [2, 1, 3, 4],
                                    [3, 2, 1, 2]]).astype(np.uint8)
 
-        self.toy_exp = [[(234, 0, 1., 4,), (123, 1, 1., 4)], [(432, 0, 1., 4), (1, 0, 1., 4)],
-                        [(234, 0, 1., 3), (123, 1, 1., 3)]]
+        self.toy_exp = [[(234, 0, 1., 1,), (123, 1, 1., 1)], [(432, 0, 1., 1), (1, 0, 1., 1)],
+                        [(234, 0, 1., 0.75), (123, 1, 1., 0.75)]]
         self.weights = [1.] * len(self.toy_label)
 
         dirname = os.path.dirname(__file__)
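The updated expectations follow directly from the new bindexer normalization with 4-byte toy vectors: an exact trie match now scores 1.0 and a distance-1 neighbour scores 0.75, replacing the raw values 4 and 3 that the old code exposed:

    num_bytes = 4
    assert 1. - 0 / num_bytes == 1.0    # exact match
    assert 1. - 1 / num_bytes == 0.75   # one differing byte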
