diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py
index c76c8b5f..57d95c8e 100644
--- a/gnes/indexer/base.py
+++ b/gnes/indexer/base.py
@@ -31,6 +31,9 @@ def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs):
     def query(self, keys: Any, *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseVectorIndexer(BaseIndexer):
 
@@ -40,6 +43,9 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[fl
     def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseTextIndexer(BaseIndexer):
 
@@ -49,6 +55,9 @@ def add(self, keys: List[int], docs: Any, weights: List[float], *args, **kwargs)
     def query(self, keys: List[int], *args, **kwargs) -> List[Any]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class BaseKeyIndexer(BaseIndexer):
 
@@ -58,6 +67,9 @@ def add(self, keys: List[Tuple[int, int]], weights: List[float], *args, **kwargs
     def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float]]:
         pass
 
+    def normalize_score(self, *args, **kwargs):
+        pass
+
 
 class JointIndexer(CompositionalEncoder):
diff --git a/gnes/indexer/vector/annoy.py b/gnes/indexer/vector/annoy.py
index cacf4c64..587ad067 100644
--- a/gnes/indexer/vector/annoy.py
+++ b/gnes/indexer/vector/annoy.py
@@ -65,10 +65,24 @@ def query(self, keys: 'np.ndarray', top_k: int, *args, **kwargs) -> List[List[Tu
         res = []
         for k in keys:
             ret, relevance_score = self._index.get_nns_by_vector(k, top_k, include_distances=True)
+            relevance_score = self.normalize_score(relevance_score, self.metric)
             chunk_info = self._key_info_indexer.query(ret)
-            res.append([(*r, -s) for r, s in zip(chunk_info, relevance_score)])
+            res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)])
         return res
 
+    def normalize_score(self, score: List[float], metrics: str, *args) -> List[float]:
+        if metrics == 'angular':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'euclidean':
+            import math
+            return list(map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score))
+        elif metrics == 'manhattan':
+            return list(map(lambda x: 1 / (1 + x / self.num_dim), score))
+        elif metrics == 'hamming':
+            return list(map(lambda x: 1 / (1 + x), score))
+        elif metrics == 'dot':
+            return score  # a dot-product is already a similarity: larger means closer
+
     @property
     def size(self):
         return self._index.get_n_items()
diff --git a/gnes/indexer/vector/bindexer/__init__.py b/gnes/indexer/vector/bindexer/__init__.py
index 85802851..26c47454 100644
--- a/gnes/indexer/vector/bindexer/__init__.py
+++ b/gnes/indexer/vector/bindexer/__init__.py
@@ -74,7 +74,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[fl
     def query(self,
               keys: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
              method: str = 'nsw',
              *args,
              **kwargs) -> List[List[Tuple]]:
@@ -93,7 +92,7 @@ def query(self,
             q_idx, doc_ids, offsets, weights = self.bindexer.find_batch_trie(
                 keys, num_rows)
             for (i, q, o, w) in zip(doc_ids, q_idx, offsets, weights):
-                result[q].append((i, o, w / self._weight_norm, 1 if normalized_score else self.num_bytes))
+                result[q].append((i, o, w / self._weight_norm, 1))
 
             # search the indexed items with similar value
             doc_ids, offsets, weights, dists, q_idx = self.bindexer.nsw_search(
@@ -101,9 +100,7 @@ def query(self,
             for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
                 if d == 0:
                     continue
-                result[q].append(
-                    (i, o, w / self._weight_norm,
-                     (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+                result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
 
         # get the top-k
         for q in range(num_rows):
@@ -112,11 +109,12 @@ def query(self,
             doc_ids, offsets, weights, dists, q_idx = self.bindexer.force_search(
                 keys, num_rows, top_k)
             for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-                result[q].append(
-                    (i, o, w / self._weight_norm,
-                     (1. - d / self.num_bytes) if normalized_score else self.num_bytes - d))
+                result[q].append((i, o, w / self._weight_norm, self.normalize_score(d)))
         return result
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / self.num_bytes
+
     def __getstate__(self):
         self.bindexer.save(self.data_path)
         d = super().__getstate__()
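The Annoy normalizer above is the template for the rest of this patch: every metric-specific distance is squashed into (0, 1] so that a larger score always means a closer match, which is why the `-s` sign flip disappears from `query`. A standalone sketch of the curves, not part of the patch; `num_dim` here and the sample distances are made-up values:

```python
import math

num_dim = 512  # hypothetical vector dimensionality

def angular_score(d):      # the same curve is used for 'hamming'
    return 1 / (1 + d)

def euclidean_score(d):
    return 1 / (1 + math.sqrt(d) / num_dim)

def manhattan_score(d):
    return 1 / (1 + d / num_dim)

for d in (0.0, 1.0, 100.0):
    print(d, angular_score(d), euclidean_score(d), manhattan_score(d))
# d = 0 maps to 1.0 under every metric, and the score decays
# monotonically toward 0 as the distance grows.
```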
diff --git a/gnes/indexer/vector/faiss.py b/gnes/indexer/vector/faiss.py
index 545fe377..134dcb68 100644
--- a/gnes/indexer/vector/faiss.py
+++ b/gnes/indexer/vector/faiss.py
@@ -61,16 +61,23 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl
             raise ValueError("vectors should be ndarray of float32")
 
         score, ids = self._faiss_index.search(keys, top_k)
+        score = self.normalize_score(score)
         ret = []
         for _id, _score in zip(ids, score):
             ret_i = []
             chunk_info = self._key_info_indexer.query(_id)
             for c_info, _score_i in zip(chunk_info, _score):
-                ret_i.append((*c_info, -_score_i))
+                ret_i.append((*c_info, _score_i))
             ret.append(ret_i)
         return ret
 
+    def normalize_score(self, score: np.ndarray, *args) -> np.ndarray:
+        if 'HNSW' in self.index_key:
+            return 1 / (1 + np.sqrt(score) / self.num_dim)
+        elif 'PQ' in self.index_key or 'Flat' in self.index_key:
+            return 1 / (1 + np.abs(np.sqrt(score)))
+
 
     @property
     def size(self):
        return self._faiss_index.ntotal
diff --git a/gnes/indexer/vector/hbindexer/__init__.py b/gnes/indexer/vector/hbindexer/__init__.py
index 16fe2f9f..73e6dac8 100644
--- a/gnes/indexer/vector/hbindexer/__init__.py
+++ b/gnes/indexer/vector/hbindexer/__init__.py
@@ -72,7 +72,6 @@ def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[fl
     def query(self,
               vectors: np.ndarray,
               top_k: int,
-              normalized_score: bool = True,
              *args,
              **kwargs) -> List[List[Tuple]]:
 
@@ -89,11 +88,13 @@ def query(self,
         doc_ids, offsets, weights, dists, q_idx = self.hbindexer.query(
             vectors, clusters, n, top_k * self.n_idx)
         for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx):
-            result[q][(i, o, w / self._weight_norm)] = (
-                1. - d / self.n_bytes * 8) if normalized_score else self.n_bytes * 8 - d
+            result[q][(i, o, w / self._weight_norm)] = self.normalize_score(d)
 
         return [sorted(ret.items(), key=lambda x: -x[1])[:top_k] for ret in result]
 
+    def normalize_score(self, distance: int, *args) -> float:
+        return 1. - distance / (self.n_bytes * 8)
+
     def __getstate__(self):
         self.hbindexer.save(self.data_path)
         d = super().__getstate__()
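Two details in the hunks above deserve a note. In `faiss.py`, the branch test spells out both membership checks deliberately: the tempting shorthand `'PQ' or 'Flat' in self.index_key` parses as `('PQ') or ('Flat' in self.index_key)`, and the non-empty literal `'PQ'` is always truthy, so that form would match every index key. In `hbindexer`, the divisor is the total bit count, so the parentheses in `self.n_bytes * 8` matter: without them the quotient would be multiplied by 8 instead. A quick standalone check of the boolean pitfall; the `index_key` value is made up:

```python
index_key = 'HNSW32'  # hypothetical FAISS index factory string

# Pitfall: parses as ('PQ') or ('Flat' in index_key) -> always truthy
print(bool('PQ' or 'Flat' in index_key))         # True, regardless of index_key
# Explicit form, as used in the hunk above
print('PQ' in index_key or 'Flat' in index_key)  # False for 'HNSW32'
```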
diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
index 323fa8a3..1b5bd3da 100644
--- a/gnes/service/indexer.py
+++ b/gnes/service/indexer.py
@@ -66,7 +66,7 @@ def _handler_chunk_search(self, msg: 'gnes_pb2.Message'):
                 r.chunk.doc_id = _doc_id
                 r.chunk.offset_1d = _offset
                 r.chunk.weight = _weight
-                r.score = _weight * qc_weight * (-1 / _relevance)
+                r.score = _weight * qc_weight * _relevance
                 r.score_explained = '[chunk_score at doc: %d, offset: %d] = ' \
                                     '(doc_chunk_weight: %.6f) * ' \
                                     '(query_doc_chunk_relevance: %.6f) * ' \
diff --git a/tests/test_bindexer.py b/tests/test_bindexer.py
index 82973425..def921e0 100644
--- a/tests/test_bindexer.py
+++ b/tests/test_bindexer.py
@@ -21,8 +21,8 @@ def setUp(self):
                                    [2, 1, 3, 4],
                                    [3, 2, 1, 2]]).astype(np.uint8)
 
-        self.toy_exp = [[(234, 0, 1., 4,), (123, 1, 1., 4)], [(432, 0, 1., 4), (1, 0, 1., 4)],
-                        [(234, 0, 1., 3), (123, 1, 1., 3)]]
+        self.toy_exp = [[(234, 0, 1., 1,), (123, 1, 1., 1)], [(432, 0, 1., 1), (1, 0, 1., 1)],
+                        [(234, 0, 1., 0.75), (123, 1, 1., 0.75)]]
         self.weights = [1.] * len(self.toy_label)
 
         dirname = os.path.dirname(__file__)
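The new expectations in `test_bindexer.py` follow directly from the normalizer in `bindexer/__init__.py`: the toy vectors are 4 bytes wide, exact trie matches are appended with a fixed score of 1, and a Hamming distance of 1 normalizes to 1 - 1/4 = 0.75. A quick sanity check as a standalone sketch; the composed-score numbers at the end are illustrative only:

```python
num_bytes = 4  # width of the toy vectors in the test above

def normalize_score(distance):
    # mirrors normalize_score in bindexer/__init__.py from the patch
    return 1. - distance / num_bytes

assert normalize_score(0) == 1.0    # exact match
assert normalize_score(1) == 0.75   # the 0.75 entries in toy_exp

# With relevance already normalized, the indexer service composes the final
# chunk score by plain multiplication instead of the old -1 / _relevance flip:
weight, qc_weight, relevance = 1.0, 0.5, 0.75
score = weight * qc_weight * relevance
print(score)  # 0.375
```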