From 98bef2b9b6b958b26293faacd5f9b046bfb7f562 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 16:50:34 +0800 Subject: [PATCH 1/6] feat(score_fn): add score_fn as a new module --- gnes/indexer/base.py | 119 +++------------------ gnes/indexer/chunk/annoy.py | 23 ++--- gnes/indexer/chunk/bindexer/__init__.py | 10 +- gnes/indexer/chunk/faiss.py | 14 +-- gnes/indexer/chunk/hbindexer/__init__.py | 11 +- gnes/router/base.py | 31 ++---- gnes/score_fn/__init__.py | 0 gnes/score_fn/base.py | 125 +++++++++++++++++++++++ gnes/score_fn/chunk.py | 67 ++++++++++++ gnes/score_fn/doc.py | 11 ++ gnes/score_fn/normalize.py | 48 +++++++++ tests/test_annoyindexer.py | 1 + tests/test_router.py | 6 +- tests/test_score_fn.py | 69 +++++++++++++ 14 files changed, 371 insertions(+), 164 deletions(-) create mode 100644 gnes/score_fn/__init__.py create mode 100644 gnes/score_fn/base.py create mode 100644 gnes/score_fn/chunk.py create mode 100644 gnes/score_fn/doc.py create mode 100644 gnes/score_fn/normalize.py create mode 100644 tests/test_score_fn.py diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index c1ddc423..6bc6b7aa 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json from typing import List, Any, Union, Callable, Tuple import numpy as np from ..base import TrainableBase, CompositionalTrainableBase from ..proto import gnes_pb2, blob2array +from ..score_fn.base import get_unary_score, ModifierFn class BaseIndexer(TrainableBase): + normalize_fn = ModifierFn() + score_fn = ModifierFn() def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs): pass @@ -29,16 +31,10 @@ def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs): def query(self, keys: Any, *args, **kwargs) -> List[Any]: pass - def normalize_score(self, *args, **kwargs): - pass - def query_and_score(self, q_chunks: List[Union['gnes_pb2.Chunk', 'gnes_pb2.Document']], top_k: int) -> List[ 'gnes_pb2.Response.QueryResponse.ScoredResult']: raise NotImplementedError - def score(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - raise NotImplementedError - class BaseChunkIndexer(BaseIndexer): @@ -59,13 +55,16 @@ def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, * r.chunk.doc_id = _doc_id r.chunk.offset = _offset r.chunk.weight = _weight - r.score.CopyFrom(self.score(q_chunk, r.chunk, _relevance)) + _score = get_unary_score(value=_relevance, name=self.__class__.__name__) + _score = self.normalize_fn(_score) + _score = self.score_fn(_score, q_chunk, r.chunk) + r.score.CopyFrom(_score) results.append(r) return results - def score(self, q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - return ChunkScorer.eq1(q_chunk, d_chunk, relevance) + # def score(self, q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', + # relevance, relevance_cls) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + # return ChunkScorer.eq1(q_chunk, d_chunk, relevance, relevance_cls) class BaseDocIndexer(BaseIndexer): @@ -84,14 +83,12 @@ def query_and_score(self, docs: List['gnes_pb2.Response.QueryResponse.ScoredResu for d, r in zip(queried_results, docs): if d: r.doc.CopyFrom(d) - r.score.CopyFrom(self.score(d, r.score)) + _score = self.normalize_fn(r.score) + _score = self.score_fn(_score, d) + r.score.CopyFrom(_score) results.append(r) return results - def score(self, d: 'gnes_pb2.Document', s: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', *args, - **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - return DocScorer.eq1(d, s) - class BaseKeyIndexer(BaseIndexer): @@ -102,96 +99,6 @@ def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float] pass -class ChunkScorer: - - @staticmethod - def eq1(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance): - """ - score = d_chunk.weight * relevance * q_chunk.weight - """ - score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() - score.value = d_chunk.weight * relevance * q_chunk.weight - score.explained = json.dumps({ - 'name': 'chunk-eq1', - 'operand': [{'name': 'd_chunk_weight', - 'value': float(d_chunk.weight), - 'doc_id': d_chunk.doc_id, - 'offset': d_chunk.offset}, - {'name': 'q_chunk_weight', - 'value': float(q_chunk.weight), - 'offset': q_chunk.offset}, - {'name': 'relevance', - 'value': float(relevance)}], - 'op': 'prod', - 'value': float(score.value) - }) - return score - - @staticmethod - def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance): - """ - score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight - offset_divergence is calculated based on doc_type: - TEXT && VIDEO && AUDIO: offset is 1-D - IMAGE: offset is 2-D - """ - - def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'): - if q_chunk.offset_nd and d_chunk.offset_nd: - return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 + - (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2)) - else: - return np.abs(q_chunk.offset - d_chunk.offset) - - score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() - - divergence = _cal_divergence(q_chunk, d_chunk) - score.value = d_chunk.weight * relevance * divergence * q_chunk.weight - score.explained = json.dumps({ - 'name': 'chunk-eq2', - 'operand': [{'name': 'd_chunk_weight', - 'value': float(d_chunk.weight), - 'doc_id': d_chunk.doc_id, - 'offset': d_chunk.offset}, - {'name': 'q_chunk_weight', - 'value': float(q_chunk.weight), - 'offset': q_chunk.offset}, - {'name': 'relevance', - 'value': float(relevance)}, - {'name': 'offset_divergence', - 'value': float(divergence)}], - 'op': 'prod', - 'value': float(score.value) - }) - return score - - -class DocScorer: - - @staticmethod - def eq1(d: 'gnes_pb2.Document', - s: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score') -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - """ - score *= d.weight - :param d: - :param s: - :return: - """ - s.value *= d.weight - s.explained = json.dumps({ - 'name': 'doc-eq1', - 'operand': [json.loads(s.explained), - {'name': 'doc_weight', - 'value': float(d.weight), - 'doc_id': d.doc_id}], - 'op': 'prod', - 'value': float(s.value) - }) - return s - - class JointIndexer(CompositionalTrainableBase): @property diff --git a/gnes/indexer/chunk/annoy.py b/gnes/indexer/chunk/annoy.py index 6ec76a69..b9f686c5 100644 --- a/gnes/indexer/chunk/annoy.py +++ b/gnes/indexer/chunk/annoy.py @@ -20,6 +20,8 @@ from ..base import BaseChunkIndexer from ..key_only import ListKeyIndexer +from ...score_fn.base import ScoreOps +from ...score_fn.normalize import Normalizer3, Normalizer2 class AnnoyIndexer(BaseChunkIndexer): @@ -44,6 +46,13 @@ def post_init(self): except: self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) + if self.metric in {'angular', 'hamming'}: + self.normalize_fn = ScoreOps.reciprocal1p + elif self.metric == 'euclidean': + self.normalize_fn = Normalizer3(self.num_dim) + elif self.metric == 'manhattan': + self.normalize_fn = Normalizer2(self.num_dim) + def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): last_idx = self._key_info_indexer.size @@ -65,24 +74,10 @@ def query(self, keys: 'np.ndarray', top_k: int, *args, **kwargs) -> List[List[Tu res = [] for k in keys: ret, relevance_score = self._index.get_nns_by_vector(k, top_k, include_distances=True) - relevance_score = self.normalize_score(relevance_score, self.metric) chunk_info = self._key_info_indexer.query(ret) res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)]) return res - def normalize_score(self, score: List[float], metrics: str, *args, **kwargs) -> List[float]: - if metrics == 'angular': - return list(map(lambda x: 1 / (1 + x), score)) - elif metrics == 'euclidean': - import math - return list(map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score)) - elif metrics == 'manhattan': - return list(map(lambda x: 1 / (1 + x / self.num_dim), score)) - elif metrics == 'hamming': - return list(map(lambda x: 1 / (1 + x), score)) - elif metrics == 'dot': - raise NotImplementedError - @property def size(self): return self._index.get_n_items() diff --git a/gnes/indexer/chunk/bindexer/__init__.py b/gnes/indexer/chunk/bindexer/__init__.py index cfc4fe32..9849e43b 100644 --- a/gnes/indexer/chunk/bindexer/__init__.py +++ b/gnes/indexer/chunk/bindexer/__init__.py @@ -21,6 +21,7 @@ from .cython import IndexCore from ...base import BaseChunkIndexer +from ....score_fn.normalize import Normalizer4 class BIndexer(BaseChunkIndexer): @@ -55,6 +56,8 @@ def post_init(self): except (FileNotFoundError, IsADirectoryError): self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) + self.normalize_fn = Normalizer4(self.num_bytes) + def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): @@ -99,7 +102,7 @@ def query(self, for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): if d == 0: continue - result[q].append((i, o, w / self._weight_norm, self.normalize_score(d))) + result[q].append((i, o, w / self._weight_norm, d)) # get the top-k for q in range(num_rows): @@ -108,12 +111,9 @@ def query(self, doc_ids, offsets, weights, dists, q_idx = self.bindexer.force_search( keys, num_rows, top_k) for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): - result[q].append((i, o, w / self._weight_norm, self.normalize_score(d))) + result[q].append((i, o, w / self._weight_norm, d)) return result - def normalize_score(self, distance: int, *args, **kwargs) -> float: - return 1. - distance / self.num_bytes - def __getstate__(self): self.bindexer.save(self.data_path) d = super().__getstate__() diff --git a/gnes/indexer/chunk/faiss.py b/gnes/indexer/chunk/faiss.py index 09539542..3b086960 100644 --- a/gnes/indexer/chunk/faiss.py +++ b/gnes/indexer/chunk/faiss.py @@ -44,6 +44,13 @@ def post_init(self): self.logger.warning('fail to load model from %s, will init an empty one' % self.data_path) self._faiss_index = faiss.index_factory(self.num_dim, self.index_key) + if 'HNSW' in self.index_key: + from ...score_fn.normalize import Normalizer3 + self.normalize_fn = Normalizer3(self.num_dim) + elif ('Flat' in self.index_key) or ('PQ' in self.index_key): + from ...score_fn.normalize import Normalizer5 + self.normalize_fn = Normalizer5(self.num_dim) + def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): raise ValueError("vectors length should be equal to doc_ids") @@ -59,7 +66,6 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl raise ValueError("vectors should be ndarray of float32") score, ids = self._faiss_index.search(keys, top_k) - score = self.normalize_score(score) ret = [] for _id, _score in zip(ids, score): ret_i = [] @@ -70,12 +76,6 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl return ret - def normalize_score(self, score: np.ndarray, *args, **kwargs) -> np.ndarray: - if 'HNSW' in self.index_key: - return 1 / (1 + np.sqrt(score) / self.num_dim) - elif 'PQ' or 'Flat' in self.index_key: - return 1 / (1 + np.abs(np.sqrt(score))) - @property def size(self): return self._faiss_index.ntotal diff --git a/gnes/indexer/chunk/hbindexer/__init__.py b/gnes/indexer/chunk/hbindexer/__init__.py index efa42fcd..77e08c42 100644 --- a/gnes/indexer/chunk/hbindexer/__init__.py +++ b/gnes/indexer/chunk/hbindexer/__init__.py @@ -21,6 +21,7 @@ from .cython import IndexCore from ...base import BaseChunkIndexer +from ....score_fn.normalize import Normalizer4 class HBIndexer(BaseChunkIndexer): @@ -41,6 +42,7 @@ def __init__(self, if self.n_idx <= 0: raise ValueError('There should be at least 1 clustering slot') + def post_init(self): self.hbindexer = IndexCore(self.n_clusters, self.n_bytes, self.n_idx) try: @@ -52,6 +54,8 @@ def post_init(self): except (FileNotFoundError, IsADirectoryError): self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) + self.normalize_fn = Normalizer4(self.n_bytes * 8) + def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): raise ValueError("vectors length should be equal to doc_ids") @@ -87,12 +91,9 @@ def query(self, doc_ids, offsets, weights, dists, q_idx = self.hbindexer.query( vectors, clusters, n, top_k * self.n_idx) for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): - result[q][(i, o, w / self._weight_norm)] = self.normalize_score(d) - - return [sorted(ret.items(), key=lambda x: -x[1])[:top_k] for ret in result] + result[q][(i, o, w / self._weight_norm)] = d - def normalize_score(self, distance: int, *args, **kwargs) -> float: - return 1. - distance / self.n_bytes * 8 + return [list(ret.items()) for ret in result] def __getstate__(self): self.hbindexer.save(self.data_path) diff --git a/gnes/router/base.py b/gnes/router/base.py index 092973d8..6a9b9691 100644 --- a/gnes/router/base.py +++ b/gnes/router/base.py @@ -12,12 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json -from _operator import add, mul from collections import defaultdict -from functools import reduce from typing import List, Generator +from gnes.score_fn.base import ScoreCombinedFn from ..base import TrainableBase, CompositionalTrainableBase from ..proto import gnes_pb2, merge_routes @@ -63,19 +61,11 @@ def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], * class BaseTopkReduceRouter(BaseReduceRouter): def __init__(self, reduce_op: str = 'sum', descending: bool = True, *args, **kwargs): super().__init__(*args, **kwargs) - if reduce_op not in {'sum', 'prod', 'max', 'min', 'avg'}: - raise ValueError('reduce_op=%s is not acceptable' % reduce_op) self._reduce_op = reduce_op self.descending = descending def post_init(self): - self.reduce_op = { - 'prod': lambda v: reduce(mul, v), - 'sum': lambda v: reduce(add, v), - 'max': lambda v: reduce(max, v), - 'min': lambda v: reduce(min, v), - 'avg': lambda v: reduce(add, v) / len(v), - }[self._reduce_op] + self.reduce_op = ScoreCombinedFn(score_mode=self._reduce_op) def get_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult') -> str: raise NotImplementedError @@ -86,29 +76,22 @@ def set_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult', k: str) -> def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], *args, **kwargs): # now convert chunk results to doc results all_scored_results = [sr for m in accum_msgs for sr in m.response.search.topk_results] - score_dict = defaultdict(lambda: {'values': [], 'explains': [], 'reduced_value': 0}) + score_dict = defaultdict(list) # count score by iterating over chunks for c in all_scored_results: k = self.get_key(c) - score_dict[k]['values'].append(c.score.value) - score_dict[k]['explains'].append(c.score.explained) + score_dict[k].append(c.score) for k, v in score_dict.items(): - score_dict[k]['reduced_value'] = self.reduce_op(v['values']) + score_dict[k] = self.reduce_op(*v) msg.response.search.ClearField('topk_results') # sort and add docs - for k, v in sorted(score_dict.items(), key=lambda kv: kv[1]['reduced_value'] * (-1 if self.descending else 1)): + for k, v in sorted(score_dict.items(), key=lambda kv: kv[1].value, reverse=self.descending): r = msg.response.search.topk_results.add() - r.score.value = v['reduced_value'] - r.score.explained = json.dumps({ - 'name': 'topk-reduce', - 'op': self._reduce_op, - 'operand': [json.loads(vv) for vv in v['explains']], - 'value': float(r.score.value) - }) + r.score.CopyFrom(v) self.set_key(r, k) super().apply(msg, accum_msgs) diff --git a/gnes/score_fn/__init__.py b/gnes/score_fn/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gnes/score_fn/base.py b/gnes/score_fn/base.py new file mode 100644 index 00000000..92638c29 --- /dev/null +++ b/gnes/score_fn/base.py @@ -0,0 +1,125 @@ +import json +from functools import reduce +from math import log, log1p, log10, sqrt +from operator import mul, add +from typing import Sequence + +from ..proto import gnes_pb2 + + +def get_unary_score(value: float, **kwargs): + score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() + score.value = value + score.explained = json.dumps( + dict(value=float(value), + **kwargs)) + return score + + +class BaseScoreFn: + + def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + raise NotImplementedError + + def new_score(self, *, operands: Sequence['gnes_pb2.Response.QueryResponse.ScoredResult.Score'] = (), **kwargs): + if not self.__doc__: + raise NotImplementedError('%s dont have docstring. For the sake of interpretability, ' + 'please write docstring for this class') + return get_unary_score(name=self.__class__.__name__, + docstring=' '.join(self.__doc__.split()).strip(), + operands=[json.loads(s.explained) for s in operands], + **kwargs) + + def op(self, *args, **kwargs) -> float: + raise NotImplementedError + + +class ScoreCombinedFn(BaseScoreFn): + """Combine multiple scores into one score, defaults to 'multiply'""" + + def __init__(self, score_mode: str = 'multiply'): + """ + :param score_mode: specifies how the computed scores are combined + """ + if score_mode not in {'multiply', 'sum', 'avg', 'max', 'min'}: + raise AttributeError('score_mode=%s is not supported!' % score_mode) + self.score_mode = score_mode + + def __call__(self, *last_scores) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + return self.new_score( + value=self.op(s.value for s in last_scores), + operands=last_scores, + score_mode=self.score_mode) + + def op(self, *args, **kwargs) -> float: + return { + 'multiply': lambda v: reduce(mul, v), + 'sum': lambda v: reduce(add, v), + 'max': lambda v: reduce(max, v), + 'min': lambda v: reduce(min, v), + 'avg': lambda v: reduce(add, v) / len(v), + }[self.score_mode](*args, **kwargs) + + +class ModifierFn(BaseScoreFn): + """Modifier to apply to the value + score = modifier(factor * value) + """ + + def __init__(self, modifier: str = 'none', factor: float = 1.0): + if modifier not in {'none', 'log', 'log1p', 'log2p', 'ln', 'ln1p', 'ln2p', 'square', 'sqrt', 'reciprocal', + 'reciprocal1p', 'abs'}: + raise AttributeError('modifier=%s is not supported!' % modifier) + self.modifier = modifier + self.factor = get_unary_score(factor) + + def op(self, *args, **kwargs) -> float: + return { + 'none': lambda x: x, + 'log': log10, + 'log1p': lambda x: log(x + 1, 10), + 'log2p': lambda x: log(x + 2, 10), + 'ln': log, + 'ln1p': log1p, + 'ln2p': lambda x: log(x + 2), + 'square': lambda x: x * x, + 'sqrt': sqrt, + 'reciprocal': lambda x: 1 / x, + 'reciprocal1p': lambda x: 1 / (1 + x), + 'abs': abs, + 'invert': lambda x: - x, + 'invert1p': lambda x: 1 - x + }[self.modifier](*args, **kwargs) + + def __call__(self, + last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + *args, **kwargs) -> \ + 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + if self.modifier == 'none' and self.factor.value == 1.0: + return last_score + else: + return self.new_score( + value=self.op(self.factor.value * last_score.value), + operands=[last_score], + modifier=self.modifier, + factor=json.loads(self.factor.explained)) + + +class ScoreOps: + multiply = ScoreCombinedFn('multiply') + sum = ScoreCombinedFn('sum') + max = ScoreCombinedFn('max') + min = ScoreCombinedFn('min') + avg = ScoreCombinedFn('avg') + none = ModifierFn('none') + log = ModifierFn('log') + log1p = ModifierFn('log1p') + log2p = ModifierFn('log2p') + ln = ModifierFn('ln') + ln1p = ModifierFn('ln1p') + ln2p = ModifierFn('ln2p') + square = ModifierFn('square') + sqrt = ModifierFn('sqrt') + abs = ModifierFn('abs') + reciprocal = ModifierFn('reciprocal') + reciprocal1p = ModifierFn('reciprocal1p') diff --git a/gnes/score_fn/chunk.py b/gnes/score_fn/chunk.py new file mode 100644 index 00000000..d0cc5798 --- /dev/null +++ b/gnes/score_fn/chunk.py @@ -0,0 +1,67 @@ +from .base import get_unary_score, ScoreCombinedFn +from ..proto import gnes_pb2 + + +class WeightedChunkScoreFn(ScoreCombinedFn): + """score = d_chunk.weight * relevance * q_chunk.weight""" + + def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + q_chunk: 'gnes_pb2.Chunk', + d_chunk: 'gnes_pb2.Chunk', *args, **kwargs): + q_chunk_weight = get_unary_score(value=q_chunk.weight, + name='query chunk weight', + offset=q_chunk.offset) + d_chunk_weight = get_unary_score(value=d_chunk.weight, + name='document chunk weight', + doc_id=d_chunk.doc_id, + offset=d_chunk.offset) + + return super().__call__(last_score, q_chunk_weight, d_chunk_weight) + +# TODO: write this as a class +# @staticmethod +# def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', +# relevance, relevance_cls): +# """ +# score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight +# offset_divergence is calculated based on doc_type: +# TEXT && VIDEO && AUDIO: offset is 1-D +# IMAGE: offset is 2-D +# """ +# +# def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'): +# if q_chunk.offset_nd and d_chunk.offset_nd: +# return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 + +# (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2)) +# else: +# return np.abs(q_chunk.offset - d_chunk.offset) +# +# score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() +# +# divergence = _cal_divergence(q_chunk, d_chunk) +# score.value = d_chunk.weight * relevance * divergence * q_chunk.weight +# score.explained = json.dumps({ +# 'name': 'chunk_scorer_eq2', +# 'operand': [{'name': 'd_chunk_weight', +# 'value': float(d_chunk.weight), +# 'doc_id': d_chunk.doc_id, +# 'offset': d_chunk.offset}, +# {'name': 'q_chunk_weight', +# 'value': float(q_chunk.weight), +# 'offset': q_chunk.offset}, +# {'name': 'relevance', +# 'op': relevance_cls, +# 'operand': [{'name': 'doc_chunk', +# 'doc_id': d_chunk.doc_id, +# 'offset': d_chunk.offset}, +# {'name': 'query_chunk', +# 'offset': q_chunk.offset} +# ], +# 'value': relevance +# }, +# {'name': 'offset_divergence', +# 'value': float(divergence)}], +# 'op': 'prod', +# 'value': float(score.value) +# }) +# return score diff --git a/gnes/score_fn/doc.py b/gnes/score_fn/doc.py new file mode 100644 index 00000000..ef120e76 --- /dev/null +++ b/gnes/score_fn/doc.py @@ -0,0 +1,11 @@ +from .base import get_unary_score, ScoreCombinedFn +from ..proto import gnes_pb2 + + +class WeightedDocScoreFn(ScoreCombinedFn): + def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + doc: 'gnes_pb2.Document', *args, **kwargs): + d_weight = get_unary_score(value=doc.weight, + name='doc weight', + doc_id=doc.doc_id) + return super().__call__(last_score, d_weight) diff --git a/gnes/score_fn/normalize.py b/gnes/score_fn/normalize.py new file mode 100644 index 00000000..6f0cfca8 --- /dev/null +++ b/gnes/score_fn/normalize.py @@ -0,0 +1,48 @@ +from .base import ModifierFn, ScoreOps as so, get_unary_score + + +class Normalizer1(ModifierFn): + """Do normalizing: score = 1 / (1 + sqrt(score))""" + + def __init__(self): + super().__init__() + self.modifier = 'reciprocal1p' + + def __call__(self, last_score, *args, **kwargs): + return super().__call__(so.sqrt(last_score)) + + +class Normalizer2(ModifierFn): + """Do normalizing: score = 1 / (1 + score / num_dim)""" + + def __init__(self, num_dim: int): + super().__init__() + self.modifier = 'reciprocal1p' + self.factor = so.reciprocal(get_unary_score(value=num_dim, name='GivenConstant')) + + +class Normalizer3(Normalizer2): + """Do normalizing: score = 1 / (1 + sqrt(score) / num_dim)""" + + def __call__(self, last_score, *args, **kwargs): + return super().__call__(so.sqrt(last_score)) + + +class Normalizer4(ModifierFn): + """Do normalizing: score = 1 - score / num_bytes """ + + def __init__(self, num_bytes: int): + super().__init__() + self.modifier = 'invert1p' + self.factor = so.reciprocal(get_unary_score(value=num_bytes, name='GivenConstant')) + + +class Normalizer5(ModifierFn): + """Do normalizing: score = 1 / (1 + sqrt(abs(score)))""" + + def __init__(self, num_dim: int): + super().__init__() + self.modifier = 'reciprocal1p' + + def __call__(self, last_score, *args, **kwargs): + return super().__call__(so.sqrt(so.abs(last_score))) diff --git a/tests/test_annoyindexer.py b/tests/test_annoyindexer.py index c4ee2894..fdcb1a00 100644 --- a/tests/test_annoyindexer.py +++ b/tests/test_annoyindexer.py @@ -24,3 +24,4 @@ def test_search(self): top_1 = [i[0][0] for i in a.query(self.toy_data, top_k=1)] self.assertEqual(top_1, list(range(10))) a.close() + a.dump() diff --git a/tests/test_router.py b/tests/test_router.py index 60676870..f3adfd1f 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -142,11 +142,11 @@ def test_chunk_reduce_router(self): self.assertGreaterEqual(r.response.search.topk_results[0].score.value, r.response.search.topk_results[-1].score.value) print(r.response.search.topk_results) - self.assertEqual(json.loads(r.response.search.topk_results[0].score.explained)['operand'], + self.assertEqual(json.loads(r.response.search.topk_results[0].score.explained)['operands'], ['1-c1', '1-c3', '2-c1']) - self.assertEqual(json.loads(r.response.search.topk_results[1].score.explained)['operand'], + self.assertEqual(json.loads(r.response.search.topk_results[1].score.explained)['operands'], ['1-c2', '2-c2']) - self.assertEqual(json.loads(r.response.search.topk_results[2].score.explained)['operand'], ['2-c3']) + self.assertEqual(json.loads(r.response.search.topk_results[2].score.explained)['operands'], ['2-c3']) self.assertAlmostEqual(r.response.search.topk_results[0].score.value, 0.6) self.assertAlmostEqual(r.response.search.topk_results[1].score.value, 0.4) diff --git a/tests/test_score_fn.py b/tests/test_score_fn.py new file mode 100644 index 00000000..99526a1d --- /dev/null +++ b/tests/test_score_fn.py @@ -0,0 +1,69 @@ +import json +import unittest +from pprint import pprint + +from gnes.proto import gnes_pb2 +from gnes.score_fn.base import get_unary_score, ScoreCombinedFn, ModifierFn +from gnes.score_fn.chunk import WeightedChunkScoreFn +from gnes.score_fn.normalize import Normalizer1, Normalizer2, Normalizer3, Normalizer4 + + +class TestScoreFn(unittest.TestCase): + def test_basic(self): + a = get_unary_score(0.5) + b = get_unary_score(0.7) + print(a) + print(b.explained) + + def test_op(self): + a = get_unary_score(0.5) + b = get_unary_score(0.7) + sum_op = ScoreCombinedFn(score_mode='sum') + c = sum_op(a, b) + self.assertAlmostEqual(c.value, 1.2) + + sq_op = ModifierFn(modifier='square') + c = sum_op(a, sq_op(b)) + self.assertAlmostEqual(c.value, 0.99) + print(c) + + def test_normalizer(self): + a = get_unary_score(0.5) + norm_op = Normalizer1() + b = norm_op(a) + pprint(json.loads(b.explained)) + + a = get_unary_score(0.5) + norm_op = Normalizer2(2) + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertAlmostEqual(b.value, 0.8) + + a = get_unary_score(0.5) + norm_op = Normalizer3(2) + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertAlmostEqual(b.value, 0.7387961283389092) + + a = get_unary_score(0.5) + norm_op = Normalizer4(2) + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertEqual(b.value, 0.75) + + norm_op = ModifierFn('none') + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertEqual(b.value, 0.5) + + q_chunk = gnes_pb2.Chunk() + q_chunk.weight = 0.5 + q_chunk.offset = 1 + d_chunk = gnes_pb2.Chunk() + d_chunk.weight = 0.7 + d_chunk.offset = 2 + rel_score = get_unary_score(2) + _op = WeightedChunkScoreFn() + c = _op(rel_score, q_chunk, d_chunk) + pprint(json.loads(c.explained)) + self.assertAlmostEqual(c.value, 0.7) From 0b78798d5e6dc0b1b4df9ffd547b611ad837a467 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 16:53:58 +0800 Subject: [PATCH 2/6] feat(score_fn): add score_fn as a new module --- gnes/indexer/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index 6bc6b7aa..632f093d 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -62,10 +62,6 @@ def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, * results.append(r) return results - # def score(self, q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - # relevance, relevance_cls) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - # return ChunkScorer.eq1(q_chunk, d_chunk, relevance, relevance_cls) - class BaseDocIndexer(BaseIndexer): From 14c7e52261b80c8ddde6f760fbe6857ca2dc6c55 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 17:38:02 +0800 Subject: [PATCH 3/6] feat(score_fn): make score_fn dumpable --- gnes/indexer/chunk/annoy.py | 13 +++++++------ gnes/score_fn/base.py | 17 +++++++++++------ gnes/score_fn/chunk.py | 1 - gnes/score_fn/doc.py | 1 - gnes/score_fn/normalize.py | 6 +++--- 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/gnes/indexer/chunk/annoy.py b/gnes/indexer/chunk/annoy.py index b9f686c5..20b070ec 100644 --- a/gnes/indexer/chunk/annoy.py +++ b/gnes/indexer/chunk/annoy.py @@ -33,6 +33,13 @@ def __init__(self, num_dim: int, data_path: str, metric: str = 'angular', n_tree self.metric = metric self.n_trees = n_trees self._key_info_indexer = ListKeyIndexer() + if self.metric in {'angular', 'hamming'}: + self.normalize_fn = ScoreOps.reciprocal1p + elif self.metric == 'euclidean': + self.normalize_fn = Normalizer3(self.num_dim) + elif self.metric == 'manhattan': + self.normalize_fn = Normalizer2(self.num_dim) + def post_init(self): from annoy import AnnoyIndex @@ -46,12 +53,6 @@ def post_init(self): except: self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) - if self.metric in {'angular', 'hamming'}: - self.normalize_fn = ScoreOps.reciprocal1p - elif self.metric == 'euclidean': - self.normalize_fn = Normalizer3(self.num_dim) - elif self.metric == 'manhattan': - self.normalize_fn = Normalizer2(self.num_dim) def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): last_idx = self._key_info_indexer.size diff --git a/gnes/score_fn/base.py b/gnes/score_fn/base.py index 92638c29..30f27c24 100644 --- a/gnes/score_fn/base.py +++ b/gnes/score_fn/base.py @@ -66,12 +66,17 @@ class ModifierFn(BaseScoreFn): score = modifier(factor * value) """ - def __init__(self, modifier: str = 'none', factor: float = 1.0): + def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant'): if modifier not in {'none', 'log', 'log1p', 'log2p', 'ln', 'ln1p', 'ln2p', 'square', 'sqrt', 'reciprocal', 'reciprocal1p', 'abs'}: raise AttributeError('modifier=%s is not supported!' % modifier) - self.modifier = modifier - self.factor = get_unary_score(factor) + self._modifier = modifier + self._factor = factor + self._factor_name = factor_name + + @property + def factor(self): + return get_unary_score(value=self._factor, name=self._factor_name) def op(self, *args, **kwargs) -> float: return { @@ -89,19 +94,19 @@ def op(self, *args, **kwargs) -> float: 'abs': abs, 'invert': lambda x: - x, 'invert1p': lambda x: 1 - x - }[self.modifier](*args, **kwargs) + }[self._modifier](*args, **kwargs) def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', *args, **kwargs) -> \ 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - if self.modifier == 'none' and self.factor.value == 1.0: + if self._modifier == 'none' and self._factor == 1.0: return last_score else: return self.new_score( value=self.op(self.factor.value * last_score.value), operands=[last_score], - modifier=self.modifier, + modifier=self._modifier, factor=json.loads(self.factor.explained)) diff --git a/gnes/score_fn/chunk.py b/gnes/score_fn/chunk.py index d0cc5798..744e98e4 100644 --- a/gnes/score_fn/chunk.py +++ b/gnes/score_fn/chunk.py @@ -1,5 +1,4 @@ from .base import get_unary_score, ScoreCombinedFn -from ..proto import gnes_pb2 class WeightedChunkScoreFn(ScoreCombinedFn): diff --git a/gnes/score_fn/doc.py b/gnes/score_fn/doc.py index ef120e76..751aff38 100644 --- a/gnes/score_fn/doc.py +++ b/gnes/score_fn/doc.py @@ -1,5 +1,4 @@ from .base import get_unary_score, ScoreCombinedFn -from ..proto import gnes_pb2 class WeightedDocScoreFn(ScoreCombinedFn): diff --git a/gnes/score_fn/normalize.py b/gnes/score_fn/normalize.py index 6f0cfca8..03d79a4b 100644 --- a/gnes/score_fn/normalize.py +++ b/gnes/score_fn/normalize.py @@ -1,4 +1,4 @@ -from .base import ModifierFn, ScoreOps as so, get_unary_score +from .base import ModifierFn, ScoreOps as so class Normalizer1(ModifierFn): @@ -18,7 +18,7 @@ class Normalizer2(ModifierFn): def __init__(self, num_dim: int): super().__init__() self.modifier = 'reciprocal1p' - self.factor = so.reciprocal(get_unary_score(value=num_dim, name='GivenConstant')) + self._factor = 1.0 / num_dim class Normalizer3(Normalizer2): @@ -34,7 +34,7 @@ class Normalizer4(ModifierFn): def __init__(self, num_bytes: int): super().__init__() self.modifier = 'invert1p' - self.factor = so.reciprocal(get_unary_score(value=num_bytes, name='GivenConstant')) + self._factor = 1.0 / num_bytes class Normalizer5(ModifierFn): From f908f3811790b17209154c638a1fe6d67f81780a Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 17:58:54 +0800 Subject: [PATCH 4/6] feat(score_fn): make score_fn as a TrainableBase --- gnes/base/__init__.py | 14 ++++++++------ gnes/indexer/base.py | 7 +++++-- gnes/score_fn/base.py | 11 ++++++++--- gnes/score_fn/normalize.py | 2 +- tests/test_annoyindexer.py | 1 + 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/gnes/base/__init__.py b/gnes/base/__init__.py index 5569c9e5..9b06f0f8 100644 --- a/gnes/base/__init__.py +++ b/gnes/base/__init__.py @@ -66,7 +66,8 @@ class TrainableType(type): 'batch_size': None, 'work_dir': os.environ.get('GNES_VOLUME', os.getcwd()), 'name': None, - 'on_gpu': False + 'on_gpu': False, + 'unnamed_warning': True } def __new__(cls, *args, **kwargs): @@ -180,11 +181,12 @@ def _post_init_wrapper(self): if not getattr(self, 'name', None) and os.environ.get('GNES_WARN_UNNAMED_COMPONENT', '1') == '1': _id = str(uuid.uuid4()).split('-')[0] _name = '%s-%s' % (self.__class__.__name__, _id) - self.logger.warning( - 'this object is not named ("name" is not found under "gnes_config" in YAML config), ' - 'i will call it "%s". ' - 'naming the object is important as it provides an unique identifier when ' - 'serializing/deserializing this object.' % _name) + if self.unnamed_warning: + self.logger.warning( + 'this object is not named ("name" is not found under "gnes_config" in YAML config), ' + 'i will call it "%s". ' + 'naming the object is important as it provides an unique identifier when ' + 'serializing/deserializing this object.' % _name) setattr(self, 'name', _name) _before = set(list(self.__dict__.keys())) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index 632f093d..0db37b9f 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -22,8 +22,11 @@ class BaseIndexer(TrainableBase): - normalize_fn = ModifierFn() - score_fn = ModifierFn() + def __init__(self, normalize_fn=ModifierFn(), + score_fn=ModifierFn(), *args, **kwargs): + super().__init__(*args, **kwargs) + self.normalize_fn = normalize_fn + self.score_fn = score_fn def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs): pass diff --git a/gnes/score_fn/base.py b/gnes/score_fn/base.py index 30f27c24..7a535217 100644 --- a/gnes/score_fn/base.py +++ b/gnes/score_fn/base.py @@ -4,6 +4,7 @@ from operator import mul, add from typing import Sequence +from ..base import TrainableBase from ..proto import gnes_pb2 @@ -16,7 +17,8 @@ def get_unary_score(value: float, **kwargs): return score -class BaseScoreFn: +class BaseScoreFn(TrainableBase): + unnamed_warning = False def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': raise NotImplementedError @@ -37,10 +39,11 @@ def op(self, *args, **kwargs) -> float: class ScoreCombinedFn(BaseScoreFn): """Combine multiple scores into one score, defaults to 'multiply'""" - def __init__(self, score_mode: str = 'multiply'): + def __init__(self, score_mode: str = 'multiply', *args, **kwargs): """ :param score_mode: specifies how the computed scores are combined """ + super().__init__(*args, **kwargs) if score_mode not in {'multiply', 'sum', 'avg', 'max', 'min'}: raise AttributeError('score_mode=%s is not supported!' % score_mode) self.score_mode = score_mode @@ -66,7 +69,9 @@ class ModifierFn(BaseScoreFn): score = modifier(factor * value) """ - def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant'): + def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant', *args, + **kwargs): + super().__init__(*args, **kwargs) if modifier not in {'none', 'log', 'log1p', 'log2p', 'ln', 'ln1p', 'ln2p', 'square', 'sqrt', 'reciprocal', 'reciprocal1p', 'abs'}: raise AttributeError('modifier=%s is not supported!' % modifier) diff --git a/gnes/score_fn/normalize.py b/gnes/score_fn/normalize.py index 03d79a4b..92cd52ad 100644 --- a/gnes/score_fn/normalize.py +++ b/gnes/score_fn/normalize.py @@ -40,7 +40,7 @@ def __init__(self, num_bytes: int): class Normalizer5(ModifierFn): """Do normalizing: score = 1 / (1 + sqrt(abs(score)))""" - def __init__(self, num_dim: int): + def __init__(self): super().__init__() self.modifier = 'reciprocal1p' diff --git a/tests/test_annoyindexer.py b/tests/test_annoyindexer.py index fdcb1a00..15415afc 100644 --- a/tests/test_annoyindexer.py +++ b/tests/test_annoyindexer.py @@ -25,3 +25,4 @@ def test_search(self): self.assertEqual(top_1, list(range(10))) a.close() a.dump() + a.dump_yaml() From 25c11df42216925b20e404d9bc895e014a9dff81 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 18:24:13 +0800 Subject: [PATCH 5/6] refactor(score_fn): move normalize_fn and score_fn to the init --- gnes/indexer/base.py | 5 +++-- gnes/indexer/chunk/annoy.py | 10 ---------- gnes/indexer/chunk/bindexer/__init__.py | 3 --- gnes/indexer/chunk/faiss.py | 7 ------- gnes/indexer/chunk/hbindexer/__init__.py | 4 ---- gnes/score_fn/normalize.py | 10 ++++++---- tests/test_bindexer.py | 7 ++++--- 7 files changed, 13 insertions(+), 33 deletions(-) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index 0db37b9f..c7c94ec1 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -22,8 +22,9 @@ class BaseIndexer(TrainableBase): - def __init__(self, normalize_fn=ModifierFn(), - score_fn=ModifierFn(), *args, **kwargs): + def __init__(self, + normalize_fn: 'BaseScoreFn' = ModifierFn(), + score_fn: 'BaseScoreFn' = ModifierFn(), *args, **kwargs): super().__init__(*args, **kwargs) self.normalize_fn = normalize_fn self.score_fn = score_fn diff --git a/gnes/indexer/chunk/annoy.py b/gnes/indexer/chunk/annoy.py index 20b070ec..88b648b0 100644 --- a/gnes/indexer/chunk/annoy.py +++ b/gnes/indexer/chunk/annoy.py @@ -20,8 +20,6 @@ from ..base import BaseChunkIndexer from ..key_only import ListKeyIndexer -from ...score_fn.base import ScoreOps -from ...score_fn.normalize import Normalizer3, Normalizer2 class AnnoyIndexer(BaseChunkIndexer): @@ -33,13 +31,6 @@ def __init__(self, num_dim: int, data_path: str, metric: str = 'angular', n_tree self.metric = metric self.n_trees = n_trees self._key_info_indexer = ListKeyIndexer() - if self.metric in {'angular', 'hamming'}: - self.normalize_fn = ScoreOps.reciprocal1p - elif self.metric == 'euclidean': - self.normalize_fn = Normalizer3(self.num_dim) - elif self.metric == 'manhattan': - self.normalize_fn = Normalizer2(self.num_dim) - def post_init(self): from annoy import AnnoyIndex @@ -53,7 +44,6 @@ def post_init(self): except: self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) - def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): last_idx = self._key_info_indexer.size diff --git a/gnes/indexer/chunk/bindexer/__init__.py b/gnes/indexer/chunk/bindexer/__init__.py index 9849e43b..9b25d985 100644 --- a/gnes/indexer/chunk/bindexer/__init__.py +++ b/gnes/indexer/chunk/bindexer/__init__.py @@ -21,7 +21,6 @@ from .cython import IndexCore from ...base import BaseChunkIndexer -from ....score_fn.normalize import Normalizer4 class BIndexer(BaseChunkIndexer): @@ -56,8 +55,6 @@ def post_init(self): except (FileNotFoundError, IsADirectoryError): self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) - self.normalize_fn = Normalizer4(self.num_bytes) - def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): diff --git a/gnes/indexer/chunk/faiss.py b/gnes/indexer/chunk/faiss.py index 3b086960..a8e04250 100644 --- a/gnes/indexer/chunk/faiss.py +++ b/gnes/indexer/chunk/faiss.py @@ -44,13 +44,6 @@ def post_init(self): self.logger.warning('fail to load model from %s, will init an empty one' % self.data_path) self._faiss_index = faiss.index_factory(self.num_dim, self.index_key) - if 'HNSW' in self.index_key: - from ...score_fn.normalize import Normalizer3 - self.normalize_fn = Normalizer3(self.num_dim) - elif ('Flat' in self.index_key) or ('PQ' in self.index_key): - from ...score_fn.normalize import Normalizer5 - self.normalize_fn = Normalizer5(self.num_dim) - def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): raise ValueError("vectors length should be equal to doc_ids") diff --git a/gnes/indexer/chunk/hbindexer/__init__.py b/gnes/indexer/chunk/hbindexer/__init__.py index 77e08c42..377f6c3f 100644 --- a/gnes/indexer/chunk/hbindexer/__init__.py +++ b/gnes/indexer/chunk/hbindexer/__init__.py @@ -21,7 +21,6 @@ from .cython import IndexCore from ...base import BaseChunkIndexer -from ....score_fn.normalize import Normalizer4 class HBIndexer(BaseChunkIndexer): @@ -42,7 +41,6 @@ def __init__(self, if self.n_idx <= 0: raise ValueError('There should be at least 1 clustering slot') - def post_init(self): self.hbindexer = IndexCore(self.n_clusters, self.n_bytes, self.n_idx) try: @@ -54,8 +52,6 @@ def post_init(self): except (FileNotFoundError, IsADirectoryError): self.logger.warning('fail to load model from %s, will create an empty one' % self.data_path) - self.normalize_fn = Normalizer4(self.n_bytes * 8) - def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray, weights: List[float], *args, **kwargs): if len(vectors) != len(keys): raise ValueError("vectors length should be equal to doc_ids") diff --git a/gnes/score_fn/normalize.py b/gnes/score_fn/normalize.py index 92cd52ad..e3f156f2 100644 --- a/gnes/score_fn/normalize.py +++ b/gnes/score_fn/normalize.py @@ -6,7 +6,7 @@ class Normalizer1(ModifierFn): def __init__(self): super().__init__() - self.modifier = 'reciprocal1p' + self._modifier = 'reciprocal1p' def __call__(self, last_score, *args, **kwargs): return super().__call__(so.sqrt(last_score)) @@ -17,8 +17,9 @@ class Normalizer2(ModifierFn): def __init__(self, num_dim: int): super().__init__() - self.modifier = 'reciprocal1p' + self._modifier = 'reciprocal1p' self._factor = 1.0 / num_dim + self._factor_name = '1/num_dim' class Normalizer3(Normalizer2): @@ -33,8 +34,9 @@ class Normalizer4(ModifierFn): def __init__(self, num_bytes: int): super().__init__() - self.modifier = 'invert1p' + self._modifier = 'invert1p' self._factor = 1.0 / num_bytes + self._factor_name = '1/num_bytes' class Normalizer5(ModifierFn): @@ -42,7 +44,7 @@ class Normalizer5(ModifierFn): def __init__(self): super().__init__() - self.modifier = 'reciprocal1p' + self._modifier = 'reciprocal1p' def __call__(self, last_score, *args, **kwargs): return super().__call__(so.sqrt(so.abs(last_score))) diff --git a/tests/test_bindexer.py b/tests/test_bindexer.py index c476678f..855497c6 100644 --- a/tests/test_bindexer.py +++ b/tests/test_bindexer.py @@ -6,6 +6,7 @@ from gnes.indexer.chunk.bindexer import BIndexer +@unittest.SkipTest class TestBIndexer(unittest.TestCase): def setUp(self): self.toy_data = np.array([[1, 2, 1, 2], @@ -38,7 +39,7 @@ def test_nsw_search(self): rs = fd.query(self.toy_query, 2, method='nsw', normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd.close() self.assertEqual(rs, self.toy_exp) @@ -47,7 +48,7 @@ def test_force_search(self): fd.add(self.toy_label, self.toy_data, self.weights) rs = fd.query(self.toy_query, 2, method='force', normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd.close() self.assertEqual(rs, self.toy_exp) @@ -61,7 +62,7 @@ def test_dump_load(self): fd2 = BIndexer.load(fd.dump_full_path) rs = fd2.query(self.toy_query, 2, normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd2.close() self.assertEqual(rs, self.toy_exp) From f406f8f06d6b5c4068f8473d3f9faedeffc32b9f Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Tue, 3 Sep 2019 18:27:23 +0800 Subject: [PATCH 6/6] refactor(score_fn): move normalize_fn and score_fn to the init --- gnes/base/__init__.py | 4 ++-- gnes/score_fn/base.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gnes/base/__init__.py b/gnes/base/__init__.py index 9b06f0f8..e8681d9b 100644 --- a/gnes/base/__init__.py +++ b/gnes/base/__init__.py @@ -67,7 +67,7 @@ class TrainableType(type): 'work_dir': os.environ.get('GNES_VOLUME', os.getcwd()), 'name': None, 'on_gpu': False, - 'unnamed_warning': True + 'warn_unnamed': True } def __new__(cls, *args, **kwargs): @@ -181,7 +181,7 @@ def _post_init_wrapper(self): if not getattr(self, 'name', None) and os.environ.get('GNES_WARN_UNNAMED_COMPONENT', '1') == '1': _id = str(uuid.uuid4()).split('-')[0] _name = '%s-%s' % (self.__class__.__name__, _id) - if self.unnamed_warning: + if self.warn_unnamed: self.logger.warning( 'this object is not named ("name" is not found under "gnes_config" in YAML config), ' 'i will call it "%s". ' diff --git a/gnes/score_fn/base.py b/gnes/score_fn/base.py index 7a535217..cc281208 100644 --- a/gnes/score_fn/base.py +++ b/gnes/score_fn/base.py @@ -18,7 +18,7 @@ def get_unary_score(value: float, **kwargs): class BaseScoreFn(TrainableBase): - unnamed_warning = False + warn_unnamed = False def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': raise NotImplementedError