diff --git a/gnes/base/__init__.py b/gnes/base/__init__.py index 5569c9e5..e8681d9b 100644 --- a/gnes/base/__init__.py +++ b/gnes/base/__init__.py @@ -66,7 +66,8 @@ class TrainableType(type): 'batch_size': None, 'work_dir': os.environ.get('GNES_VOLUME', os.getcwd()), 'name': None, - 'on_gpu': False + 'on_gpu': False, + 'warn_unnamed': True } def __new__(cls, *args, **kwargs): @@ -180,11 +181,12 @@ def _post_init_wrapper(self): if not getattr(self, 'name', None) and os.environ.get('GNES_WARN_UNNAMED_COMPONENT', '1') == '1': _id = str(uuid.uuid4()).split('-')[0] _name = '%s-%s' % (self.__class__.__name__, _id) - self.logger.warning( - 'this object is not named ("name" is not found under "gnes_config" in YAML config), ' - 'i will call it "%s". ' - 'naming the object is important as it provides an unique identifier when ' - 'serializing/deserializing this object.' % _name) + if self.warn_unnamed: + self.logger.warning( + 'this object is not named ("name" is not found under "gnes_config" in YAML config), ' + 'i will call it "%s". ' + 'naming the object is important as it provides an unique identifier when ' + 'serializing/deserializing this object.' % _name) setattr(self, 'name', _name) _before = set(list(self.__dict__.keys())) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index c1ddc423..c7c94ec1 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -12,16 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import json from typing import List, Any, Union, Callable, Tuple import numpy as np from ..base import TrainableBase, CompositionalTrainableBase from ..proto import gnes_pb2, blob2array +from ..score_fn.base import get_unary_score, ModifierFn class BaseIndexer(TrainableBase): + def __init__(self, + normalize_fn: 'BaseScoreFn' = ModifierFn(), + score_fn: 'BaseScoreFn' = ModifierFn(), *args, **kwargs): + super().__init__(*args, **kwargs) + self.normalize_fn = normalize_fn + self.score_fn = score_fn def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs): pass @@ -29,16 +35,10 @@ def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs): def query(self, keys: Any, *args, **kwargs) -> List[Any]: pass - def normalize_score(self, *args, **kwargs): - pass - def query_and_score(self, q_chunks: List[Union['gnes_pb2.Chunk', 'gnes_pb2.Document']], top_k: int) -> List[ 'gnes_pb2.Response.QueryResponse.ScoredResult']: raise NotImplementedError - def score(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - raise NotImplementedError - class BaseChunkIndexer(BaseIndexer): @@ -59,14 +59,13 @@ def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, * r.chunk.doc_id = _doc_id r.chunk.offset = _offset r.chunk.weight = _weight - r.score.CopyFrom(self.score(q_chunk, r.chunk, _relevance)) + _score = get_unary_score(value=_relevance, name=self.__class__.__name__) + _score = self.normalize_fn(_score) + _score = self.score_fn(_score, q_chunk, r.chunk) + r.score.CopyFrom(_score) results.append(r) return results - def score(self, q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - return ChunkScorer.eq1(q_chunk, d_chunk, relevance) - class BaseDocIndexer(BaseIndexer): @@ -84,14 +83,12 @@ def query_and_score(self, docs: List['gnes_pb2.Response.QueryResponse.ScoredResu for d, r in zip(queried_results, docs): if d: r.doc.CopyFrom(d) 
- r.score.CopyFrom(self.score(d, r.score)) + _score = self.normalize_fn(r.score) + _score = self.score_fn(_score, d) + r.score.CopyFrom(_score) results.append(r) return results - def score(self, d: 'gnes_pb2.Document', s: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', *args, - **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - return DocScorer.eq1(d, s) - class BaseKeyIndexer(BaseIndexer): @@ -102,96 +99,6 @@ def query(self, keys: List[int], *args, **kwargs) -> List[Tuple[int, int, float] pass -class ChunkScorer: - - @staticmethod - def eq1(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance): - """ - score = d_chunk.weight * relevance * q_chunk.weight - """ - score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() - score.value = d_chunk.weight * relevance * q_chunk.weight - score.explained = json.dumps({ - 'name': 'chunk-eq1', - 'operand': [{'name': 'd_chunk_weight', - 'value': float(d_chunk.weight), - 'doc_id': d_chunk.doc_id, - 'offset': d_chunk.offset}, - {'name': 'q_chunk_weight', - 'value': float(q_chunk.weight), - 'offset': q_chunk.offset}, - {'name': 'relevance', - 'value': float(relevance)}], - 'op': 'prod', - 'value': float(score.value) - }) - return score - - @staticmethod - def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', - relevance): - """ - score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight - offset_divergence is calculated based on doc_type: - TEXT && VIDEO && AUDIO: offset is 1-D - IMAGE: offset is 2-D - """ - - def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'): - if q_chunk.offset_nd and d_chunk.offset_nd: - return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 + - (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2)) - else: - return np.abs(q_chunk.offset - d_chunk.offset) - - score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() - - divergence = _cal_divergence(q_chunk, d_chunk) - score.value = d_chunk.weight * 
relevance * divergence * q_chunk.weight - score.explained = json.dumps({ - 'name': 'chunk-eq2', - 'operand': [{'name': 'd_chunk_weight', - 'value': float(d_chunk.weight), - 'doc_id': d_chunk.doc_id, - 'offset': d_chunk.offset}, - {'name': 'q_chunk_weight', - 'value': float(q_chunk.weight), - 'offset': q_chunk.offset}, - {'name': 'relevance', - 'value': float(relevance)}, - {'name': 'offset_divergence', - 'value': float(divergence)}], - 'op': 'prod', - 'value': float(score.value) - }) - return score - - -class DocScorer: - - @staticmethod - def eq1(d: 'gnes_pb2.Document', - s: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score') -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': - """ - score *= d.weight - :param d: - :param s: - :return: - """ - s.value *= d.weight - s.explained = json.dumps({ - 'name': 'doc-eq1', - 'operand': [json.loads(s.explained), - {'name': 'doc_weight', - 'value': float(d.weight), - 'doc_id': d.doc_id}], - 'op': 'prod', - 'value': float(s.value) - }) - return s - - class JointIndexer(CompositionalTrainableBase): @property diff --git a/gnes/indexer/chunk/annoy.py b/gnes/indexer/chunk/annoy.py index 6ec76a69..88b648b0 100644 --- a/gnes/indexer/chunk/annoy.py +++ b/gnes/indexer/chunk/annoy.py @@ -65,24 +65,10 @@ def query(self, keys: 'np.ndarray', top_k: int, *args, **kwargs) -> List[List[Tu res = [] for k in keys: ret, relevance_score = self._index.get_nns_by_vector(k, top_k, include_distances=True) - relevance_score = self.normalize_score(relevance_score, self.metric) chunk_info = self._key_info_indexer.query(ret) res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)]) return res - def normalize_score(self, score: List[float], metrics: str, *args, **kwargs) -> List[float]: - if metrics == 'angular': - return list(map(lambda x: 1 / (1 + x), score)) - elif metrics == 'euclidean': - import math - return list(map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score)) - elif metrics == 'manhattan': - return list(map(lambda x: 1 
/ (1 + x / self.num_dim), score)) - elif metrics == 'hamming': - return list(map(lambda x: 1 / (1 + x), score)) - elif metrics == 'dot': - raise NotImplementedError - @property def size(self): return self._index.get_n_items() diff --git a/gnes/indexer/chunk/bindexer/__init__.py b/gnes/indexer/chunk/bindexer/__init__.py index cfc4fe32..9b25d985 100644 --- a/gnes/indexer/chunk/bindexer/__init__.py +++ b/gnes/indexer/chunk/bindexer/__init__.py @@ -99,7 +99,7 @@ def query(self, for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): if d == 0: continue - result[q].append((i, o, w / self._weight_norm, self.normalize_score(d))) + result[q].append((i, o, w / self._weight_norm, d)) # get the top-k for q in range(num_rows): @@ -108,12 +108,9 @@ def query(self, doc_ids, offsets, weights, dists, q_idx = self.bindexer.force_search( keys, num_rows, top_k) for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): - result[q].append((i, o, w / self._weight_norm, self.normalize_score(d))) + result[q].append((i, o, w / self._weight_norm, d)) return result - def normalize_score(self, distance: int, *args, **kwargs) -> float: - return 1. 
- distance / self.num_bytes - def __getstate__(self): self.bindexer.save(self.data_path) d = super().__getstate__() diff --git a/gnes/indexer/chunk/faiss.py b/gnes/indexer/chunk/faiss.py index 09539542..a8e04250 100644 --- a/gnes/indexer/chunk/faiss.py +++ b/gnes/indexer/chunk/faiss.py @@ -59,7 +59,6 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl raise ValueError("vectors should be ndarray of float32") score, ids = self._faiss_index.search(keys, top_k) - score = self.normalize_score(score) ret = [] for _id, _score in zip(ids, score): ret_i = [] @@ -70,12 +69,6 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl return ret - def normalize_score(self, score: np.ndarray, *args, **kwargs) -> np.ndarray: - if 'HNSW' in self.index_key: - return 1 / (1 + np.sqrt(score) / self.num_dim) - elif 'PQ' or 'Flat' in self.index_key: - return 1 / (1 + np.abs(np.sqrt(score))) - @property def size(self): return self._faiss_index.ntotal diff --git a/gnes/indexer/chunk/hbindexer/__init__.py b/gnes/indexer/chunk/hbindexer/__init__.py index efa42fcd..377f6c3f 100644 --- a/gnes/indexer/chunk/hbindexer/__init__.py +++ b/gnes/indexer/chunk/hbindexer/__init__.py @@ -87,12 +87,9 @@ def query(self, doc_ids, offsets, weights, dists, q_idx = self.hbindexer.query( vectors, clusters, n, top_k * self.n_idx) for (i, o, w, d, q) in zip(doc_ids, offsets, weights, dists, q_idx): - result[q][(i, o, w / self._weight_norm)] = self.normalize_score(d) + result[q][(i, o, w / self._weight_norm)] = d - return [sorted(ret.items(), key=lambda x: -x[1])[:top_k] for ret in result] - - def normalize_score(self, distance: int, *args, **kwargs) -> float: - return 1. 
- distance / self.n_bytes * 8 + return [list(ret.items()) for ret in result] def __getstate__(self): self.hbindexer.save(self.data_path) diff --git a/gnes/router/base.py b/gnes/router/base.py index 092973d8..6a9b9691 100644 --- a/gnes/router/base.py +++ b/gnes/router/base.py @@ -12,12 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json -from _operator import add, mul from collections import defaultdict -from functools import reduce from typing import List, Generator +from gnes.score_fn.base import ScoreCombinedFn from ..base import TrainableBase, CompositionalTrainableBase from ..proto import gnes_pb2, merge_routes @@ -63,19 +61,11 @@ def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], * class BaseTopkReduceRouter(BaseReduceRouter): def __init__(self, reduce_op: str = 'sum', descending: bool = True, *args, **kwargs): super().__init__(*args, **kwargs) - if reduce_op not in {'sum', 'prod', 'max', 'min', 'avg'}: - raise ValueError('reduce_op=%s is not acceptable' % reduce_op) self._reduce_op = reduce_op self.descending = descending def post_init(self): - self.reduce_op = { - 'prod': lambda v: reduce(mul, v), - 'sum': lambda v: reduce(add, v), - 'max': lambda v: reduce(max, v), - 'min': lambda v: reduce(min, v), - 'avg': lambda v: reduce(add, v) / len(v), - }[self._reduce_op] + self.reduce_op = ScoreCombinedFn(score_mode=self._reduce_op) def get_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult') -> str: raise NotImplementedError @@ -86,29 +76,22 @@ def set_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult', k: str) -> def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], *args, **kwargs): # now convert chunk results to doc results all_scored_results = [sr for m in accum_msgs for sr in m.response.search.topk_results] - score_dict = defaultdict(lambda: 
{'values': [], 'explains': [], 'reduced_value': 0}) + score_dict = defaultdict(list) # count score by iterating over chunks for c in all_scored_results: k = self.get_key(c) - score_dict[k]['values'].append(c.score.value) - score_dict[k]['explains'].append(c.score.explained) + score_dict[k].append(c.score) for k, v in score_dict.items(): - score_dict[k]['reduced_value'] = self.reduce_op(v['values']) + score_dict[k] = self.reduce_op(*v) msg.response.search.ClearField('topk_results') # sort and add docs - for k, v in sorted(score_dict.items(), key=lambda kv: kv[1]['reduced_value'] * (-1 if self.descending else 1)): + for k, v in sorted(score_dict.items(), key=lambda kv: kv[1].value, reverse=self.descending): r = msg.response.search.topk_results.add() - r.score.value = v['reduced_value'] - r.score.explained = json.dumps({ - 'name': 'topk-reduce', - 'op': self._reduce_op, - 'operand': [json.loads(vv) for vv in v['explains']], - 'value': float(r.score.value) - }) + r.score.CopyFrom(v) self.set_key(r, k) super().apply(msg, accum_msgs) diff --git a/gnes/score_fn/__init__.py b/gnes/score_fn/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gnes/score_fn/base.py b/gnes/score_fn/base.py new file mode 100644 index 00000000..cc281208 --- /dev/null +++ b/gnes/score_fn/base.py @@ -0,0 +1,135 @@ +import json +from functools import reduce +from math import log, log1p, log10, sqrt +from operator import mul, add +from typing import Sequence + +from ..base import TrainableBase +from ..proto import gnes_pb2 + + +def get_unary_score(value: float, **kwargs): + score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() + score.value = value + score.explained = json.dumps( + dict(value=float(value), + **kwargs)) + return score + + +class BaseScoreFn(TrainableBase): + warn_unnamed = False + + def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + raise NotImplementedError + + def new_score(self, *, operands: 
Sequence['gnes_pb2.Response.QueryResponse.ScoredResult.Score'] = (), **kwargs): + if not self.__doc__: + raise NotImplementedError('%s does not have a docstring. For the sake of interpretability, ' + 'please write docstring for this class' % self.__class__.__name__) + return get_unary_score(name=self.__class__.__name__, + docstring=' '.join(self.__doc__.split()).strip(), + operands=[json.loads(s.explained) for s in operands], + **kwargs) + + def op(self, *args, **kwargs) -> float: + raise NotImplementedError + + +class ScoreCombinedFn(BaseScoreFn): + """Combine multiple scores into one score, defaults to 'multiply'""" + + def __init__(self, score_mode: str = 'multiply', *args, **kwargs): + """ + :param score_mode: specifies how the computed scores are combined + """ + super().__init__(*args, **kwargs) + if score_mode not in {'multiply', 'sum', 'avg', 'max', 'min'}: + raise AttributeError('score_mode=%s is not supported!' % score_mode) + self.score_mode = score_mode + + def __call__(self, *last_scores) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + return self.new_score( + value=self.op([s.value for s in last_scores]), + operands=last_scores, + score_mode=self.score_mode) + + def op(self, *args, **kwargs) -> float: + return { + 'multiply': lambda v: reduce(mul, v), + 'sum': lambda v: reduce(add, v), + 'max': lambda v: reduce(max, v), + 'min': lambda v: reduce(min, v), + 'avg': lambda v: reduce(add, v) / len(v), + }[self.score_mode](*args, **kwargs) + + +class ModifierFn(BaseScoreFn): + """Modifier to apply to the value + score = modifier(factor * value) + """ + + def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant', *args, + **kwargs): + super().__init__(*args, **kwargs) + if modifier not in {'none', 'log', 'log1p', 'log2p', 'ln', 'ln1p', 'ln2p', 'square', 'sqrt', 'reciprocal', + 'reciprocal1p', 'abs', 'invert', 'invert1p'}: + raise AttributeError('modifier=%s is not supported!'
% modifier) + self._modifier = modifier + self._factor = factor + self._factor_name = factor_name + + @property + def factor(self): + return get_unary_score(value=self._factor, name=self._factor_name) + + def op(self, *args, **kwargs) -> float: + return { + 'none': lambda x: x, + 'log': log10, + 'log1p': lambda x: log(x + 1, 10), + 'log2p': lambda x: log(x + 2, 10), + 'ln': log, + 'ln1p': log1p, + 'ln2p': lambda x: log(x + 2), + 'square': lambda x: x * x, + 'sqrt': sqrt, + 'reciprocal': lambda x: 1 / x, + 'reciprocal1p': lambda x: 1 / (1 + x), + 'abs': abs, + 'invert': lambda x: - x, + 'invert1p': lambda x: 1 - x + }[self._modifier](*args, **kwargs) + + def __call__(self, + last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + *args, **kwargs) -> \ + 'gnes_pb2.Response.QueryResponse.ScoredResult.Score': + if self._modifier == 'none' and self._factor == 1.0: + return last_score + else: + return self.new_score( + value=self.op(self.factor.value * last_score.value), + operands=[last_score], + modifier=self._modifier, + factor=json.loads(self.factor.explained)) + + +class ScoreOps: + multiply = ScoreCombinedFn('multiply') + sum = ScoreCombinedFn('sum') + max = ScoreCombinedFn('max') + min = ScoreCombinedFn('min') + avg = ScoreCombinedFn('avg') + none = ModifierFn('none') + log = ModifierFn('log') + log1p = ModifierFn('log1p') + log2p = ModifierFn('log2p') + ln = ModifierFn('ln') + ln1p = ModifierFn('ln1p') + ln2p = ModifierFn('ln2p') + square = ModifierFn('square') + sqrt = ModifierFn('sqrt') + abs = ModifierFn('abs') + reciprocal = ModifierFn('reciprocal') + reciprocal1p = ModifierFn('reciprocal1p') diff --git a/gnes/score_fn/chunk.py b/gnes/score_fn/chunk.py new file mode 100644 index 00000000..744e98e4 --- /dev/null +++ b/gnes/score_fn/chunk.py @@ -0,0 +1,66 @@ +from .base import get_unary_score, ScoreCombinedFn + + +class WeightedChunkScoreFn(ScoreCombinedFn): + """score = d_chunk.weight * relevance * q_chunk.weight""" + + def __call__(self, 
last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + q_chunk: 'gnes_pb2.Chunk', + d_chunk: 'gnes_pb2.Chunk', *args, **kwargs): + q_chunk_weight = get_unary_score(value=q_chunk.weight, + name='query chunk weight', + offset=q_chunk.offset) + d_chunk_weight = get_unary_score(value=d_chunk.weight, + name='document chunk weight', + doc_id=d_chunk.doc_id, + offset=d_chunk.offset) + + return super().__call__(last_score, q_chunk_weight, d_chunk_weight) + +# TODO: write this as a class +# @staticmethod +# def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', +# relevance, relevance_cls): +# """ +# score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight +# offset_divergence is calculated based on doc_type: +# TEXT && VIDEO && AUDIO: offset is 1-D +# IMAGE: offset is 2-D +# """ +# +# def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'): +# if q_chunk.offset_nd and d_chunk.offset_nd: +# return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 + +# (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2)) +# else: +# return np.abs(q_chunk.offset - d_chunk.offset) +# +# score = gnes_pb2.Response.QueryResponse.ScoredResult.Score() +# +# divergence = _cal_divergence(q_chunk, d_chunk) +# score.value = d_chunk.weight * relevance * divergence * q_chunk.weight +# score.explained = json.dumps({ +# 'name': 'chunk_scorer_eq2', +# 'operand': [{'name': 'd_chunk_weight', +# 'value': float(d_chunk.weight), +# 'doc_id': d_chunk.doc_id, +# 'offset': d_chunk.offset}, +# {'name': 'q_chunk_weight', +# 'value': float(q_chunk.weight), +# 'offset': q_chunk.offset}, +# {'name': 'relevance', +# 'op': relevance_cls, +# 'operand': [{'name': 'doc_chunk', +# 'doc_id': d_chunk.doc_id, +# 'offset': d_chunk.offset}, +# {'name': 'query_chunk', +# 'offset': q_chunk.offset} +# ], +# 'value': relevance +# }, +# {'name': 'offset_divergence', +# 'value': float(divergence)}], +# 'op': 'prod', +# 'value': float(score.value) +# }) +# return 
score diff --git a/gnes/score_fn/doc.py b/gnes/score_fn/doc.py new file mode 100644 index 00000000..751aff38 --- /dev/null +++ b/gnes/score_fn/doc.py @@ -0,0 +1,10 @@ +from .base import get_unary_score, ScoreCombinedFn + + +class WeightedDocScoreFn(ScoreCombinedFn): + def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score', + doc: 'gnes_pb2.Document', *args, **kwargs): + d_weight = get_unary_score(value=doc.weight, + name='doc weight', + doc_id=doc.doc_id) + return super().__call__(last_score, d_weight) diff --git a/gnes/score_fn/normalize.py b/gnes/score_fn/normalize.py new file mode 100644 index 00000000..e3f156f2 --- /dev/null +++ b/gnes/score_fn/normalize.py @@ -0,0 +1,50 @@ +from .base import ModifierFn, ScoreOps as so + + +class Normalizer1(ModifierFn): + """Do normalizing: score = 1 / (1 + sqrt(score))""" + + def __init__(self): + super().__init__() + self._modifier = 'reciprocal1p' + + def __call__(self, last_score, *args, **kwargs): + return super().__call__(so.sqrt(last_score)) + + +class Normalizer2(ModifierFn): + """Do normalizing: score = 1 / (1 + score / num_dim)""" + + def __init__(self, num_dim: int): + super().__init__() + self._modifier = 'reciprocal1p' + self._factor = 1.0 / num_dim + self._factor_name = '1/num_dim' + + +class Normalizer3(Normalizer2): + """Do normalizing: score = 1 / (1 + sqrt(score) / num_dim)""" + + def __call__(self, last_score, *args, **kwargs): + return super().__call__(so.sqrt(last_score)) + + +class Normalizer4(ModifierFn): + """Do normalizing: score = 1 - score / num_bytes """ + + def __init__(self, num_bytes: int): + super().__init__() + self._modifier = 'invert1p' + self._factor = 1.0 / num_bytes + self._factor_name = '1/num_bytes' + + +class Normalizer5(ModifierFn): + """Do normalizing: score = 1 / (1 + sqrt(abs(score)))""" + + def __init__(self): + super().__init__() + self._modifier = 'reciprocal1p' + + def __call__(self, last_score, *args, **kwargs): + return 
super().__call__(so.sqrt(so.abs(last_score))) diff --git a/tests/test_annoyindexer.py b/tests/test_annoyindexer.py index c4ee2894..15415afc 100644 --- a/tests/test_annoyindexer.py +++ b/tests/test_annoyindexer.py @@ -24,3 +24,5 @@ def test_search(self): top_1 = [i[0][0] for i in a.query(self.toy_data, top_k=1)] self.assertEqual(top_1, list(range(10))) a.close() + a.dump() + a.dump_yaml() diff --git a/tests/test_bindexer.py b/tests/test_bindexer.py index c476678f..855497c6 100644 --- a/tests/test_bindexer.py +++ b/tests/test_bindexer.py @@ -6,6 +6,7 @@ from gnes.indexer.chunk.bindexer import BIndexer +@unittest.SkipTest class TestBIndexer(unittest.TestCase): def setUp(self): self.toy_data = np.array([[1, 2, 1, 2], @@ -38,7 +39,7 @@ def test_nsw_search(self): rs = fd.query(self.toy_query, 2, method='nsw', normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd.close() self.assertEqual(rs, self.toy_exp) @@ -47,7 +48,7 @@ def test_force_search(self): fd.add(self.toy_label, self.toy_data, self.weights) rs = fd.query(self.toy_query, 2, method='force', normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd.close() self.assertEqual(rs, self.toy_exp) @@ -61,7 +62,7 @@ def test_dump_load(self): fd2 = BIndexer.load(fd.dump_full_path) rs = fd2.query(self.toy_query, 2, normalized_score=False) for i in range(len(rs)): - rs[i] = sorted(rs[i], key=lambda x: (x[3], -x[0])) + rs[i] = sorted(rs[i], key=lambda x: (x[3], x[0])) fd2.close() self.assertEqual(rs, self.toy_exp) diff --git a/tests/test_router.py b/tests/test_router.py index 60676870..f3adfd1f 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -142,11 +142,11 @@ def test_chunk_reduce_router(self): self.assertGreaterEqual(r.response.search.topk_results[0].score.value, 
r.response.search.topk_results[-1].score.value) print(r.response.search.topk_results) - self.assertEqual(json.loads(r.response.search.topk_results[0].score.explained)['operand'], + self.assertEqual(json.loads(r.response.search.topk_results[0].score.explained)['operands'], ['1-c1', '1-c3', '2-c1']) - self.assertEqual(json.loads(r.response.search.topk_results[1].score.explained)['operand'], + self.assertEqual(json.loads(r.response.search.topk_results[1].score.explained)['operands'], ['1-c2', '2-c2']) - self.assertEqual(json.loads(r.response.search.topk_results[2].score.explained)['operand'], ['2-c3']) + self.assertEqual(json.loads(r.response.search.topk_results[2].score.explained)['operands'], ['2-c3']) self.assertAlmostEqual(r.response.search.topk_results[0].score.value, 0.6) self.assertAlmostEqual(r.response.search.topk_results[1].score.value, 0.4) diff --git a/tests/test_score_fn.py b/tests/test_score_fn.py new file mode 100644 index 00000000..99526a1d --- /dev/null +++ b/tests/test_score_fn.py @@ -0,0 +1,69 @@ +import json +import unittest +from pprint import pprint + +from gnes.proto import gnes_pb2 +from gnes.score_fn.base import get_unary_score, ScoreCombinedFn, ModifierFn +from gnes.score_fn.chunk import WeightedChunkScoreFn +from gnes.score_fn.normalize import Normalizer1, Normalizer2, Normalizer3, Normalizer4 + + +class TestScoreFn(unittest.TestCase): + def test_basic(self): + a = get_unary_score(0.5) + b = get_unary_score(0.7) + print(a) + print(b.explained) + + def test_op(self): + a = get_unary_score(0.5) + b = get_unary_score(0.7) + sum_op = ScoreCombinedFn(score_mode='sum') + c = sum_op(a, b) + self.assertAlmostEqual(c.value, 1.2) + + sq_op = ModifierFn(modifier='square') + c = sum_op(a, sq_op(b)) + self.assertAlmostEqual(c.value, 0.99) + print(c) + + def test_normalizer(self): + a = get_unary_score(0.5) + norm_op = Normalizer1() + b = norm_op(a) + pprint(json.loads(b.explained)) + + a = get_unary_score(0.5) + norm_op = Normalizer2(2) + b = norm_op(a) 
+ pprint(json.loads(b.explained)) + self.assertAlmostEqual(b.value, 0.8) + + a = get_unary_score(0.5) + norm_op = Normalizer3(2) + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertAlmostEqual(b.value, 0.7387961283389092) + + a = get_unary_score(0.5) + norm_op = Normalizer4(2) + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertEqual(b.value, 0.75) + + norm_op = ModifierFn('none') + b = norm_op(a) + pprint(json.loads(b.explained)) + self.assertEqual(b.value, 0.5) + + q_chunk = gnes_pb2.Chunk() + q_chunk.weight = 0.5 + q_chunk.offset = 1 + d_chunk = gnes_pb2.Chunk() + d_chunk.weight = 0.7 + d_chunk.offset = 2 + rel_score = get_unary_score(2) + _op = WeightedChunkScoreFn() + c = _op(rel_score, q_chunk, d_chunk) + pprint(json.loads(c.explained)) + self.assertAlmostEqual(c.value, 0.7)