This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Merge pull request #259 from gnes-ai/score_fn_
feat(scorefn): add offset tfidf bm25 querycoord
mergify[bot] authored Sep 12, 2019
2 parents 79d32db + 441e910 commit 227c697
Showing 4 changed files with 165 additions and 55 deletions.
22 changes: 17 additions & 5 deletions gnes/indexer/base.py
@@ -14,6 +14,7 @@
# limitations under the License.
from functools import wraps
from typing import List, Any, Union, Callable, Tuple
from collections import defaultdict

import numpy as np

@@ -24,8 +25,8 @@

class BaseIndexer(TrainableBase):
def __init__(self,
normalize_fn: 'BaseScoreFn' = ModifierScoreFn(),
score_fn: 'BaseScoreFn' = ModifierScoreFn(),
normalize_fn: 'BaseScoreFn' = None,
score_fn: 'BaseScoreFn' = None,
is_big_score_similar: bool = False,
*args, **kwargs):
"""
@@ -35,11 +36,14 @@ def __init__(self,
:param is_big_score_similar: when set to true, larger score means more similar
"""
super().__init__(*args, **kwargs)
self.normalize_fn = normalize_fn
self.score_fn = score_fn
self.normalize_fn = normalize_fn if normalize_fn else ModifierScoreFn()
self.score_fn = score_fn if score_fn else ModifierScoreFn()
self.normalize_fn._context = self
self.score_fn._context = self
self.is_big_score_similar = is_big_score_similar
self._num_docs = 0
self._num_chunks = 0
self._num_chunks_in_doc = defaultdict(int)
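
Moving the defaults from `ModifierScoreFn()` to `None` sidesteps Python's shared-default-argument pitfall: a default is evaluated once, when the function is defined, so every indexer would otherwise share one score-fn instance, and the new `_context` assignment would silently cross-contaminate them. A minimal standalone sketch with stand-in classes, not code from this diff:

```python
class Fn:
    """Stands in for ModifierScoreFn; only the _context attribute matters here."""
    def __init__(self):
        self._context = None

class SharedDefaultIndexer:
    def __init__(self, score_fn: Fn = Fn()):   # default evaluated once, at definition
        self.score_fn = score_fn
        self.score_fn._context = self           # mutates the shared default

a, b = SharedDefaultIndexer(), SharedDefaultIndexer()
assert a.score_fn is b.score_fn                 # one Fn instance serves both indexers
assert a.score_fn._context is b                 # a's context was silently overwritten

class LazyDefaultIndexer:
    def __init__(self, score_fn: Fn = None):
        self.score_fn = score_fn if score_fn else Fn()  # fresh instance per indexer
        self.score_fn._context = self

c, d = LazyDefaultIndexer(), LazyDefaultIndexer()
assert c.score_fn is not d.score_fn
assert c.score_fn._context is c
```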

def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs):
pass
@@ -99,7 +103,7 @@ def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, *
dict(name='query_chunk',
offset=q_chunk.offset)])
_score = self.normalize_fn(_score)
_score = self.score_fn(_score, q_chunk, r.chunk)
_score = self.score_fn(_score, q_chunk, r.chunk, queried_results)
r.score.CopyFrom(_score)
results.append(r)
return results
@@ -111,6 +115,8 @@ def arg_wrapper(self, keys: List[Tuple[int, int]], *args, **kwargs):
doc_ids, _ = zip(*keys)
self._num_docs += len(set(doc_ids))
self._num_chunks += len(keys)
for doc_id in doc_ids:
self._num_chunks_in_doc[doc_id] += 1
return func(self, keys, *args, **kwargs)

return arg_wrapper
@@ -140,6 +146,12 @@ def num_chunks(self):
else:
return self._num_chunks

def num_chunks_in_doc(self, doc_id: int):
if self.helper_indexer:
return self.helper_indexer._num_chunks_in_doc[doc_id]
else:
self.logger.warning('enable helper_indexer to track num_chunks_in_doc')
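
The per-doc counting added above is plain `defaultdict` bookkeeping over the `(doc_id, offset)` keys passed to `add()`. A small sketch of the same logic with made-up keys:

```python
from collections import defaultdict

# three chunks spread over two docs (hypothetical keys)
keys = [(0, 0), (0, 1), (1, 0)]
num_chunks_in_doc = defaultdict(int)
doc_ids, _ = zip(*keys)
for doc_id in doc_ids:
    num_chunks_in_doc[doc_id] += 1

assert num_chunks_in_doc[0] == 2
assert num_chunks_in_doc[1] == 1
assert num_chunks_in_doc[42] == 0   # unseen doc_ids default to 0, no KeyError
```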


class BaseDocIndexer(BaseIndexer):
"""Storing documents and contents """
8 changes: 6 additions & 2 deletions gnes/score_fn/base.py
@@ -36,6 +36,10 @@ class BaseScoreFn(TrainableBase):

warn_unnamed = False

def __init__(self, context=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._context = context

def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score':
raise NotImplementedError
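
The new `context` argument is how a score function reaches back into its owning indexer for corpus statistics (`num_chunks`, `num_docs`, `num_chunks_in_doc`) at scoring time. A minimal sketch of that handshake, using hypothetical stand-in classes rather than the real GNES types:

```python
import numpy as np

class TinyIndexer:
    """Stands in for BaseIndexer: exposes a corpus statistic and injects itself."""
    num_chunks = 1000

    def __init__(self, score_fn):
        self.score_fn = score_fn
        self.score_fn._context = self   # the handshake done in BaseIndexer.__init__

class IdfLikeFn:
    """Stands in for a score fn that needs corpus statistics at call time."""
    def __init__(self, context=None):
        self._context = context

    def __call__(self, tf: int) -> float:
        return np.log10(self._context.num_chunks / (tf + 1))

fn = IdfLikeFn()
TinyIndexer(fn)
print(fn(4))   # ≈ 2.3, computed from the indexer's num_chunks
```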

@@ -87,8 +91,8 @@ class ModifierScoreFn(BaseScoreFn):
score = modifier(factor * value)
"""

def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant', *args,
**kwargs):
def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant',
*args, **kwargs):
super().__init__(*args, **kwargs)
if modifier not in self.supported_ops:
raise AttributeError(
171 changes: 124 additions & 47 deletions gnes/score_fn/chunk.py
@@ -14,6 +14,8 @@
# limitations under the License.

from .base import get_unary_score, CombinedScoreFn
from typing import List, Tuple
import numpy as np


class WeightedChunkScoreFn(CombinedScoreFn):
@@ -32,50 +34,125 @@ def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Sco

return super().__call__(last_score, q_chunk_weight, d_chunk_weight)

# TODO: write this as a class
# @staticmethod
# def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk',
# relevance, relevance_cls):
# """
# score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight
# offset_divergence is calculated based on doc_type:
# TEXT && VIDEO && AUDIO: offset is 1-D
# IMAGE: offset is 2-D
# """
#
# def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
# if q_chunk.offset_nd and d_chunk.offset_nd:
# return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
# (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
# else:
# return np.abs(q_chunk.offset - d_chunk.offset)
#
# score = gnes_pb2.Response.QueryResponse.ScoredResult.Score()
#
# divergence = _cal_divergence(q_chunk, d_chunk)
# score.value = d_chunk.weight * relevance * divergence * q_chunk.weight
# score.explained = json.dumps({
# 'name': 'chunk_scorer_eq2',
# 'operand': [{'name': 'd_chunk_weight',
# 'value': float(d_chunk.weight),
# 'doc_id': d_chunk.doc_id,
# 'offset': d_chunk.offset},
# {'name': 'q_chunk_weight',
# 'value': float(q_chunk.weight),
# 'offset': q_chunk.offset},
# {'name': 'relevance',
# 'op': relevance_cls,
# 'operand': [{'name': 'doc_chunk',
# 'doc_id': d_chunk.doc_id,
# 'offset': d_chunk.offset},
# {'name': 'query_chunk',
# 'offset': q_chunk.offset}
# ],
# 'value': relevance
# },
# {'name': 'offset_divergence',
# 'value': float(divergence)}],
# 'op': 'prod',
# 'value': float(score.value)
# })
# return score

class WeightedChunkOffsetScoreFn(CombinedScoreFn):
"""
score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight
offset_divergence is calculated based on doc_type:
TEXT && VIDEO && AUDIO: offset is 1-D
IMAGE: offset is 2-D
"""

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
q_chunk_weight = get_unary_score(value=q_chunk.weight,
name='query chunk weight',
offset=str(q_chunk.offset))
d_chunk_weight = get_unary_score(value=d_chunk.weight,
name='document chunk weight',
doc_id=d_chunk.doc_id,
offset=str(d_chunk.offset))
offset_divergence = get_unary_score(value=self._cal_divergence(q_chunk, d_chunk),
name='offset divergence')
return super().__call__(last_score, q_chunk_weight, d_chunk_weight, offset_divergence)

@staticmethod
def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
if q_chunk.offset_nd and d_chunk.offset_nd:
return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
(q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
else:
return np.abs(q_chunk.offset - d_chunk.offset)
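
Worked numbers for `_cal_divergence`; note that the two branches scale in opposite directions (the 2-D branch is an inverse-distance closeness score, while the 1-D branch grows with distance). A standalone sketch of the same arithmetic, with hypothetical offsets:

```python
import numpy as np

def offset_divergence(q_offset, d_offset, q_offset_nd=None, d_offset_nd=None):
    # 2-D offsets (IMAGE): 1 when offsets coincide, shrinking toward 0 with distance
    if q_offset_nd and d_offset_nd:
        return 1 / (1 + np.sqrt((q_offset_nd[0] - d_offset_nd[0]) ** 2 +
                                (q_offset_nd[1] - d_offset_nd[1]) ** 2))
    # 1-D offsets (TEXT/VIDEO/AUDIO): absolute distance, which grows with
    # distance, unlike the 2-D branch above
    return np.abs(q_offset - d_offset)

print(offset_divergence(0, 3))                  # 3.0
print(offset_divergence(0, 0, (0, 0), (3, 4)))  # 1 / (1 + 5) ≈ 0.167
```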


class CoordChunkScoreFn(CombinedScoreFn):
"""
score = relevance * query_coordination
query_coordination: #chunks returned from this doc / #chunks indexed for this doc
"""

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
query_coordination = get_unary_score(value=self._cal_query_coord(d_chunk, queried_results),
name='query coordination')
return super().__call__(last_score, query_coordination)

def _cal_query_coord(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
doc_id = d_chunk.doc_id
total_chunks = self._context.num_chunks_in_doc(doc_id)
queried_doc_id, _, _, _ = zip(*(queried_results[0]))
recall_chunks = queried_doc_id.count(doc_id)
return recall_chunks / total_chunks
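
A worked example of the query coordination ratio with made-up result tuples; the `(doc_id, offset, weight, relevance)` layout is an assumption inferred from the unpacking patterns in `_cal_query_coord` and `_cal_tf_idf`:

```python
# doc 7 has 4 chunks indexed; 3 of them come back in the results
queried_results = [[(7, 0, 0.5, 0.9), (7, 1, 0.5, 0.8),
                    (3, 0, 0.5, 0.7), (7, 2, 0.5, 0.6)]]
doc_id = 7
queried_doc_id, _, _, _ = zip(*queried_results[0])
recall_chunks = queried_doc_id.count(doc_id)   # 3 chunks of doc 7 returned
total_chunks_in_doc = 4                        # num_chunks_in_doc(7), via helper indexer
print(recall_chunks / total_chunks_in_doc)     # 0.75
```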


class TFIDFChunkScoreFn(CombinedScoreFn):
"""
score = relevance * tf(q_chunk) * (idf(q_chunk)**2)
tf(q_chunk) is calculated from the relevance of the query results:
tf(q_chunk) = number of queried chunks where relevance >= threshold
idf(q_chunk) = log(total_chunks / (tf(q_chunk) + 1))
"""

def __init__(self, threshold: float = 0.8, *args, **kwargs):
super().__init__(*args, **kwargs)
self.threshold = threshold

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
tf_idf = get_unary_score(value=self._cal_tf_idf(queried_results),
name='query tf-idf')
return super().__call__(last_score, tf_idf)

def _cal_tf_idf(self, queried_results: List[List[Tuple]]):
_, _, _, queried_relevance = zip(*(queried_results[0]))
tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))

total_chunks = self._context.num_chunks
idf = np.log10(total_chunks / (tf + 1))
return tf * (idf ** 2)
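
Plugging hypothetical numbers into the `_cal_tf_idf` arithmetic (a corpus of 1000 chunks, a 0.8 threshold, and made-up relevance scores):

```python
import numpy as np

queried_relevance = (0.95, 0.9, 0.85, 0.81, 0.4, 0.1)
threshold = 0.8
tf = len([r for r in queried_relevance if r >= threshold])  # 4 chunks clear the bar
total_chunks = 1000
idf = np.log10(total_chunks / (tf + 1))                     # log10(200) ≈ 2.30
print(tf * idf ** 2)                                        # ≈ 21.2
```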


class BM25ChunkScoreFn(CombinedScoreFn):
"""
score = relevance * idf(q_chunk) * tf(q_chunk) * (k1 + 1) / (tf(q_chunk) +
k1 * (1 - b + b * (chunk_in_doc / avg_chunk_in_doc)))
in bm25 algorithm:
idf(q_chunk) = log(1 + (doc_count - f(q_chunk) + 0.5) / (f(q_chunk) + 0.5)),
where f(q_chunk) is the number of docs that contain q_chunk. In our system, this denotes
the number of docs appearing in the query results.
Following Elasticsearch's defaults, b = 0.75, k1 = 1.2
"""

def __init__(self, threshold: float = 0.8, *args, **kwargs):
super().__init__(*args, **kwargs)
self.threshold = threshold
self.k1 = 1.2
self.b = 0.75

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
bm25 = get_unary_score(value=self._cal_bm25(d_chunk, queried_results),
name='query bm25')
return super().__call__(last_score, bm25)

def _cal_bm25(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
doc_id = d_chunk.doc_id
_, _, _, queried_relevance = zip(*(queried_results[0]))
tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))

total_chunks = self._context.num_chunks
idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))
return idf * tf * (self.k1 + 1) / (tf + self.k1 * (1 - self.b + self.b *
(self._context.num_chunks_in_doc(doc_id) * self._context.num_docs / self._context.num_chunks)))
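
The same style of worked example for `_cal_bm25`, with hypothetical corpus numbers; the last factor rescales `num_chunks_in_doc * num_docs / num_chunks`, i.e. this doc's length relative to the average doc, which is the standard BM25 length normalization:

```python
import numpy as np

# 1000 chunks across 100 docs (avg 10 chunks/doc); the matched doc holds
# 20 chunks and tf = 4 returned chunks cleared the relevance threshold
tf, k1, b = 4, 1.2, 0.75
total_chunks, num_docs, chunks_in_doc = 1000, 100, 20

idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))    # ≈ 2.35
len_ratio = chunks_in_doc * num_docs / total_chunks           # doc is 2x average length
norm = 1 - b + b * len_ratio                                  # 1.75
print(idf * tf * (k1 + 1) / (tf + k1 * norm))                 # ≈ 3.39, damped by length
```
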
19 changes: 18 additions & 1 deletion tests/test_score_fn.py
Expand Up @@ -4,7 +4,7 @@

from gnes.proto import gnes_pb2
from gnes.score_fn.base import get_unary_score, CombinedScoreFn, ModifierScoreFn
from gnes.score_fn.chunk import WeightedChunkScoreFn
from gnes.score_fn.chunk import WeightedChunkScoreFn, WeightedChunkOffsetScoreFn, CoordChunkScoreFn, TFIDFChunkScoreFn, BM25ChunkScoreFn
from gnes.score_fn.normalize import Normalizer1, Normalizer2, Normalizer3, Normalizer4


@@ -27,6 +27,23 @@ def test_op(self):
self.assertAlmostEqual(c.value, 0.99)
print(c)

def test_combine_score_fn(self):
from gnes.indexer.chunk.helper import ListKeyIndexer
from gnes.indexer.chunk.numpy import NumpyIndexer
from gnes.proto import array2blob
import numpy as np

q_chunk = gnes_pb2.Chunk()
q_chunk.doc_id = 2
q_chunk.weight = 0.3
q_chunk.offset = 0
q_chunk.embedding.CopyFrom(array2blob(np.array([3, 3, 3])))

for _fn in [WeightedChunkOffsetScoreFn, CoordChunkScoreFn, TFIDFChunkScoreFn, BM25ChunkScoreFn]:
indexer = NumpyIndexer(helper_indexer=ListKeyIndexer(), score_fn=_fn())
indexer.add(keys=[(0, 1), (1, 2)], vectors=np.array([[1, 1, 1], [2, 2, 2]]), weights=[0.5, 0.8])
queried_result = indexer.query_and_score(q_chunks=[q_chunk], top_k=2)

def test_normalizer(self):
a = get_unary_score(0.5)
norm_op = Normalizer1()
