This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Merge pull request #55 from gnes-ai/normalize_weight
fix(indexer): normalize weight
Larryjianfeng authored Jul 25, 2019
2 parents 86f4527 + 689abe3 commit 50269c7
Showing 7 changed files with 48 additions and 16 deletions.
12 changes: 12 additions & 0 deletions gnes/indexer/base.py
@@ -31,6 +31,9 @@ def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs):
     def query(self, keys: Any, *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseVectorIndexer(BaseIndexer):
 
@@ -40,6 +43,9 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseTextIndexer(BaseIndexer):
 
@@ -49,6 +55,9 @@ def add(self, keys: List[int], docs: Any, weights: List[float], *args, **kwargs):
     def query(self, keys: List[int], *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseKeyIndexer(BaseIndexer):
 
@@ -58,6 +67,9 @@ def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs):
     def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class JointIndexer(CompositionalEncoder):
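The base classes now expose normalize_score() as a no-op hook, so every concrete indexer can map its raw distances onto a comparable relevance scale. A minimal sketch of how a subclass might override it (the class and mapping below are illustrative, not part of this commit):

    class ToyVectorIndexer(BaseVectorIndexer):
        def normalize_score(self, score, *args, **kwargs):
            # squash raw distances into (0, 1]: distance 0 -> score 1.0
            return [1 / (1 + s) for s in score]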
16 changes: 15 additions & 1 deletion gnes/indexer/vector/annoy.py
@@ -65,10 +65,24 @@ def query(self, keys: 'np.ndarray', top_k: int, *args, **kwargs) -> List[List[Tuple]]:
         res = []
         for k in keys:
             ret, relevance_score = self._index.get_nns_by_vector(k, top_k, include_distances=True)
+            relevance_score = self.normalize_score(relevance_score, self.metric)
             chunk_info = self._key_info_indexer.query(ret)
-            res.append([(*r, -s) for r, s in zip(chunk_info, relevance_score)])
+            res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)])
         return res
 
+    def normalize_score(self, score: List[float], metrics: str, *args) -> List[float]:
+        if metrics == 'angular':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'euclidean':
+            import math
+            return list(map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score))
+        elif metrics == 'manhattan':
+            return list(map(lambda x: 1 / (1 + x / self.num_dim), score))
+        elif metrics == 'hamming':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'dot':
+            pass
+
     @property
     def size(self):
         return self._index.get_n_items()
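The Annoy indexer now converts raw distances into scores per metric before returning them, so the score negation in query() is gone. Note that the 'dot' branch falls through and returns None, leaving dot-product results unnormalized. Plugging sample numbers into the new branches (num_dim = 128 is a hypothetical dimensionality):

    import math

    num_dim = 128                                 # hypothetical vector dimensionality
    print(1 / (1 + 0.0))                          # angular/hamming, d=0   -> 1.0
    print(1 / (1 + math.sqrt(256.0) / num_dim))   # euclidean,      d=256 -> 0.888...
    print(1 / (1 + 64.0 / num_dim))               # manhattan,      d=64  -> 0.666...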
14 changes: 6 additions & 8 deletions gnes/indexer/vector/bindexer/__init__.py
@@ -74,7 +74,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self,
               keys: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
               method: str = 'nsw',
               *args,
               **kwargs) -> List[List[Tuple]]:
@@ -93,17 +92,15 @@ def query(self,
         q_idx, doc_ids, offsets, weights = self.bindexer.find_batch_trie(
             keys, num_rows)
         for (i, q, o, w) in zip(doc_ids, q_idx, offsets, weights):
-            result[q].append((i, o, w / self._weight_norm, 1 if normalized_score else self.num_bytes))
+            result[q].append((i, o, w / self._weight_norm, 1))
 
         # search the indexed items with similar value
         doc_ids, offsets, weights, dists, q_idx = self.bindexer.nsw_search(
             keys, num_rows, top_k)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
             if d == 0:
                 continue
-            result[q].append(
-                (i, o, w / self._weight_norm,
-                 (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+            result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
 
         # get the top-k
         for q in range(num_rows):
@@ -112,11 +109,12 @@
         doc_ids, offsets, weights, dists, q_idx = self.bindexer.force_search(
             keys, num_rows, top_k)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-            result[q].append(
-                (i, o, w / self._weight_norm,
-                 (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+            result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
         return result
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / self.num_bytes
+
     def __getstate__(self):
         self.bindexer.save(self.data_path)
         d = super().__getstate__()
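With the normalized_score flag removed, the binary indexer always reports 1 - distance / num_bytes: exact trie matches score 1 and Hamming neighbours decay linearly. A worked example, assuming num_bytes = 4 as in tests/test_bindexer.py:

    num_bytes = 4
    for d in (0, 1, 2, 4):
        print(d, 1. - d / num_bytes)   # 0 -> 1.0, 1 -> 0.75, 2 -> 0.5, 4 -> 0.0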
9 changes: 8 additions & 1 deletion gnes/indexer/vector/faiss.py
@@ -61,16 +61,23 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
             raise ValueError("vectors should be ndarray of float32")
 
         score, ids = self._faiss_index.search(keys, top_k)
+        score = self.normalize_score(score)
         ret = []
         for _id, _score in zip(ids, score):
             ret_i = []
             chunk_info = self._key_info_indexer.query(_id)
             for c_info, _score_i in zip(chunk_info, _score):
-                ret_i.append((*c_info, -_score_i))
+                ret_i.append((*c_info, _score_i))
             ret.append(ret_i)
 
         return ret
 
+    def normalize_score(self, score: np.ndarray, *args) -> np.ndarray:
+        if 'HNSW' in self.index_key:
+            return 1 / (1 + np.sqrt(score) / self.num_dim)
+        elif 'PQ' in self.index_key or 'Flat' in self.index_key:
+            return 1 / (1 + np.abs(np.sqrt(score)))
+
     @property
     def size(self):
         return self._faiss_index.ntotal
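FAISS reports squared L2 distances under METRIC_L2, which is why both branches take a square root before squashing. A sketch of the Flat/PQ branch on made-up raw scores:

    import numpy as np

    raw = np.array([[0.0, 4.0, 25.0]])      # hypothetical squared-L2 distances
    print(1 / (1 + np.abs(np.sqrt(raw))))   # [[1.0, 0.333..., 0.166...]]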
7 changes: 4 additions & 3 deletions gnes/indexer/vector/hbindexer/__init__.py
@@ -72,7 +72,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
     def query(self,
               vectors: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
               *args,
               **kwargs) -> List[List[Tuple]]:
 
@@ -89,11 +88,13 @@ def query(self,
         doc_ids, offsets, weights, dists, q_idx = self.hbindexer.query(
             vectors, clusters, n, top_k * self.n_idx)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-            result[q][(i, o, w / self._weight_norm)] = (
-                1. - d / self.n_bytes * 8) if normalized_score else self.n_bytes * 8 - d
+            result[q][(i, o, w / self._weight_norm)] = self.normalize_score(d)
 
         return [sorted(ret.items(), key=lambda x: -x[1])[:top_k] for ret in result]
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / (self.n_bytes * 8)
+
     def __getstate__(self):
         self.hbindexer.save(self.data_path)
         d = super().__getstate__()
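Here the distance is a Hamming count over n_bytes * 8 bits, so the score decays linearly from 1 at an exact match to 0 at the maximum distance. For example, with a hypothetical n_bytes = 16 (128 bits):

    n_bytes = 16
    for d in (0, 32, 128):
        print(d, 1. - d / (n_bytes * 8))   # 0 -> 1.0, 32 -> 0.75, 128 -> 0.0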
2 changes: 1 addition & 1 deletion gnes/service/indexer.py
@@ -66,7 +66,7 @@ def _handler_chunk_search(self, msg: 'gnes_pb2.Message'):
             r.chunk.doc_id = _doc_id
             r.chunk.offset_1d = _offset
             r.chunk.weight = _weight
-            r.score = _weight * qc_weight * (-1 / _relevance)
+            r.score = _weight * qc_weight * _relevance
             r.score_explained = '[chunk_score at doc: %d, offset: %d] = ' \
                                 '(doc_chunk_weight: %.6f) * ' \
                                 '(query_doc_chunk_relevance: %.6f) * ' \
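Because relevance now arrives pre-normalized into (0, 1], the chunk score is a plain product of bounded factors instead of the old -1 / _relevance inversion. With hypothetical values:

    _weight, qc_weight, _relevance = 0.8, 0.5, 0.75   # hypothetical inputs
    print(_weight * qc_weight * _relevance)           # 0.3; bounded when every factor is in (0, 1]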
4 changes: 2 additions & 2 deletions tests/test_bindexer.py
@@ -21,8 +21,8 @@ def setUp(self):
                                    [2, 1, 3, 4],
                                    [3, 2, 1, 2]]).astype(np.uint8)
 
-        self.toy_exp = [[(234, 0, 1., 4,), (123, 1, 1., 4)], [(432, 0, 1., 4), (1, 0, 1., 4)],
-                        [(234, 0, 1., 3), (123, 1, 1., 3)]]
+        self.toy_exp = [[(234, 0, 1., 1,), (123, 1, 1., 1)], [(432, 0, 1., 1), (1, 0, 1., 1)],
+                        [(234, 0, 1., 0.75), (123, 1, 1., 0.75)]]
         self.weights = [1.] * len(self.toy_label)
 
         dirname = os.path.dirname(__file__)
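The updated expectations follow directly from the new bindexer normalization with 4-byte toy vectors: an exact trie match now scores 1.0 and a distance-1 neighbour scores 0.75, replacing the raw values 4 and 3 that the old code exposed:

    num_bytes = 4
    assert 1. - 0 / num_bytes == 1.0    # exact match
    assert 1. - 1 / num_bytes == 0.75   # one differing byte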
