This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Merge pull request #259 from gnes-ai/score_fn_
feat(scorefn): add offset tfidf bm25 querycoord
mergify[bot] authored Sep 12, 2019
2 parents 79d32db + 441e910 commit 227c697
Showing 4 changed files with 165 additions and 55 deletions.
22 changes: 17 additions & 5 deletions gnes/indexer/base.py
@@ -14,6 +14,7 @@
# limitations under the License.
from functools import wraps
from typing import List, Any, Union, Callable, Tuple
from collections import defaultdict

import numpy as np

@@ -24,8 +25,8 @@

class BaseIndexer(TrainableBase):
def __init__(self,
normalize_fn: 'BaseScoreFn' = ModifierScoreFn(),
score_fn: 'BaseScoreFn' = ModifierScoreFn(),
normalize_fn: 'BaseScoreFn' = None,
score_fn: 'BaseScoreFn' = None,
is_big_score_similar: bool = False,
*args, **kwargs):
"""
@@ -35,11 +36,14 @@ def __init__(self,
:param is_big_score_similar: when set to true, larger score means more similar
"""
super().__init__(*args, **kwargs)
self.normalize_fn = normalize_fn
self.score_fn = score_fn
self.normalize_fn = normalize_fn if normalize_fn else ModifierScoreFn()
self.score_fn = score_fn if score_fn else ModifierScoreFn()
self.normalize_fn._context = self
self.score_fn._context = self
self.is_big_score_similar = is_big_score_similar
self._num_docs = 0
self._num_chunks = 0
self._num_chunks_in_doc = defaultdict(int)
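
Moving the defaults from `ModifierScoreFn()` to `None` sidesteps Python's shared-default-argument pitfall: a default is evaluated once, when the function is defined, so every indexer would otherwise share one score-fn instance, and the new `_context` assignment would silently cross-contaminate them. A minimal standalone sketch with stand-in classes, not code from this diff:

```python
class Fn:
    """Stands in for ModifierScoreFn; only the _context attribute matters here."""
    def __init__(self):
        self._context = None

class SharedDefaultIndexer:
    def __init__(self, score_fn: Fn = Fn()):   # default evaluated once, at definition
        self.score_fn = score_fn
        self.score_fn._context = self           # mutates the shared default

a, b = SharedDefaultIndexer(), SharedDefaultIndexer()
assert a.score_fn is b.score_fn                 # one Fn instance serves both indexers
assert a.score_fn._context is b                 # a's context was silently overwritten

class LazyDefaultIndexer:
    def __init__(self, score_fn: Fn = None):
        self.score_fn = score_fn if score_fn else Fn()  # fresh instance per indexer
        self.score_fn._context = self

c, d = LazyDefaultIndexer(), LazyDefaultIndexer()
assert c.score_fn is not d.score_fn
assert c.score_fn._context is c
```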

def add(self, keys: Any, docs: Any, weights: List[float], *args, **kwargs):
pass
@@ -99,7 +103,7 @@ def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, *
dict(name='query_chunk',
offset=q_chunk.offset)])
_score = self.normalize_fn(_score)
_score = self.score_fn(_score, q_chunk, r.chunk)
_score = self.score_fn(_score, q_chunk, r.chunk, queried_results)
r.score.CopyFrom(_score)
results.append(r)
return results
@@ -111,6 +115,8 @@ def arg_wrapper(self, keys: List[Tuple[int, int]], *args, **kwargs):
doc_ids, _ = zip(*keys)
self._num_docs += len(set(doc_ids))
self._num_chunks += len(keys)
for doc_id in doc_ids:
self._num_chunks_in_doc[doc_id] += 1
return func(self, keys, *args, **kwargs)

return arg_wrapper
@@ -140,6 +146,12 @@ def num_chunks(self):
else:
return self._num_chunks

def num_chunks_in_doc(self, doc_id: int):
if self.helper_indexer:
return self.helper_indexer._num_chunks_in_doc[doc_id]
else:
self.logger.warning('enable helper_indexer to track num_chunks_in_doc')
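
The per-doc counting added above is plain `defaultdict` bookkeeping over the `(doc_id, offset)` keys passed to `add()`. A small sketch of the same logic with made-up keys:

```python
from collections import defaultdict

# three chunks spread over two docs (hypothetical keys)
keys = [(0, 0), (0, 1), (1, 0)]
num_chunks_in_doc = defaultdict(int)
doc_ids, _ = zip(*keys)
for doc_id in doc_ids:
    num_chunks_in_doc[doc_id] += 1

assert num_chunks_in_doc[0] == 2
assert num_chunks_in_doc[1] == 1
assert num_chunks_in_doc[42] == 0   # unseen doc_ids default to 0, no KeyError
```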


class BaseDocIndexer(BaseIndexer):
"""Storing documents and contents """
8 changes: 6 additions & 2 deletions gnes/score_fn/base.py
@@ -36,6 +36,10 @@ class BaseScoreFn(TrainableBase):

warn_unnamed = False

def __init__(self, context=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._context = context

def __call__(self, *args, **kwargs) -> 'gnes_pb2.Response.QueryResponse.ScoredResult.Score':
raise NotImplementedError
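
The new `context` argument is how a score function reaches back into its owning indexer for corpus statistics (`num_chunks`, `num_docs`, `num_chunks_in_doc`) at scoring time. A minimal sketch of that handshake, using hypothetical stand-in classes rather than the real GNES types:

```python
import numpy as np

class TinyIndexer:
    """Stands in for BaseIndexer: exposes a corpus statistic and injects itself."""
    num_chunks = 1000

    def __init__(self, score_fn):
        self.score_fn = score_fn
        self.score_fn._context = self   # the handshake done in BaseIndexer.__init__

class IdfLikeFn:
    """Stands in for a score fn that needs corpus statistics at call time."""
    def __init__(self, context=None):
        self._context = context

    def __call__(self, tf: int) -> float:
        return np.log10(self._context.num_chunks / (tf + 1))

fn = IdfLikeFn()
TinyIndexer(fn)
print(fn(4))   # ≈ 2.3, computed from the indexer's num_chunks
```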

@@ -87,8 +91,8 @@ class ModifierScoreFn(BaseScoreFn):
score = modifier(factor * value)
"""

def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant', *args,
**kwargs):
def __init__(self, modifier: str = 'none', factor: float = 1.0, factor_name: str = 'GivenConstant',
*args, **kwargs):
super().__init__(*args, **kwargs)
if modifier not in self.supported_ops:
raise AttributeError(
171 changes: 124 additions & 47 deletions gnes/score_fn/chunk.py
@@ -14,6 +14,8 @@
# limitations under the License.

from .base import get_unary_score, CombinedScoreFn
from typing import List, Tuple
import numpy as np


class WeightedChunkScoreFn(CombinedScoreFn):
@@ -32,50 +34,125 @@ def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Sco

return super().__call__(last_score, q_chunk_weight, d_chunk_weight)

# TODO: write this as a class
# @staticmethod
# def eq2(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk',
# relevance, relevance_cls):
# """
# score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight
# offset_divergence is calculated based on doc_type:
# TEXT && VIDEO && AUDIO: offset is 1-D
# IMAGE: offset is 2-D
# """
#
# def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
# if q_chunk.offset_nd and d_chunk.offset_nd:
# return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
# (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
# else:
# return np.abs(q_chunk.offset - d_chunk.offset)
#
# score = gnes_pb2.Response.QueryResponse.ScoredResult.Score()
#
# divergence = _cal_divergence(q_chunk, d_chunk)
# score.value = d_chunk.weight * relevance * divergence * q_chunk.weight
# score.explained = json.dumps({
# 'name': 'chunk_scorer_eq2',
# 'operand': [{'name': 'd_chunk_weight',
# 'value': float(d_chunk.weight),
# 'doc_id': d_chunk.doc_id,
# 'offset': d_chunk.offset},
# {'name': 'q_chunk_weight',
# 'value': float(q_chunk.weight),
# 'offset': q_chunk.offset},
# {'name': 'relevance',
# 'op': relevance_cls,
# 'operand': [{'name': 'doc_chunk',
# 'doc_id': d_chunk.doc_id,
# 'offset': d_chunk.offset},
# {'name': 'query_chunk',
# 'offset': q_chunk.offset}
# ],
# 'value': relevance
# },
# {'name': 'offset_divergence',
# 'value': float(divergence)}],
# 'op': 'prod',
# 'value': float(score.value)
# })
# return score

class WeightedChunkOffsetScoreFn(CombinedScoreFn):
"""
score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight
offset_divergence is calculated based on doc_type:
TEXT && VIDEO && AUDIO: offset is 1-D
IMAGE: offset is 2-D
"""

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
q_chunk_weight = get_unary_score(value=q_chunk.weight,
name='query chunk weight',
offset=str(q_chunk.offset))
d_chunk_weight = get_unary_score(value=d_chunk.weight,
name='document chunk weight',
doc_id=d_chunk.doc_id,
offset=str(d_chunk.offset))
offset_divergence = get_unary_score(value=self._cal_divergence(q_chunk, d_chunk),
name='offset divergence')
return super().__call__(last_score, q_chunk_weight, d_chunk_weight, offset_divergence)

@staticmethod
def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
if q_chunk.offset_nd and d_chunk.offset_nd:
return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
(q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
else:
return np.abs(q_chunk.offset - d_chunk.offset)
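
Worked numbers for `_cal_divergence`; note that the two branches scale in opposite directions (the 2-D branch is an inverse-distance closeness score, while the 1-D branch grows with distance). A standalone sketch of the same arithmetic, with hypothetical offsets:

```python
import numpy as np

def offset_divergence(q_offset, d_offset, q_offset_nd=None, d_offset_nd=None):
    # 2-D offsets (IMAGE): 1 when offsets coincide, shrinking toward 0 with distance
    if q_offset_nd and d_offset_nd:
        return 1 / (1 + np.sqrt((q_offset_nd[0] - d_offset_nd[0]) ** 2 +
                                (q_offset_nd[1] - d_offset_nd[1]) ** 2))
    # 1-D offsets (TEXT/VIDEO/AUDIO): absolute distance, which grows with
    # distance, unlike the 2-D branch above
    return np.abs(q_offset - d_offset)

print(offset_divergence(0, 3))                  # 3.0
print(offset_divergence(0, 0, (0, 0), (3, 4)))  # 1 / (1 + 5) ≈ 0.167
```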


class CoordChunkScoreFn(CombinedScoreFn):
"""
score = relevance * query_coordination
query_coordination: #chunks returned from this doc / #chunks indexed for this doc
"""

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
query_coordination = get_unary_score(value=self._cal_query_coord(d_chunk, queried_results),
name='query coordination')
return super().__call__(last_score, query_coordination)

def _cal_query_coord(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
doc_id = d_chunk.doc_id
total_chunks = self._context.num_chunks_in_doc(doc_id)
queried_doc_id, _, _, _ = zip(*(queried_results[0]))
recall_chunks = queried_doc_id.count(doc_id)
return recall_chunks / total_chunks
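
A worked example of the query coordination ratio with made-up result tuples; the `(doc_id, offset, weight, relevance)` layout is an assumption inferred from the unpacking patterns in `_cal_query_coord` and `_cal_tf_idf`:

```python
# doc 7 has 4 chunks indexed; 3 of them come back in the results
queried_results = [[(7, 0, 0.5, 0.9), (7, 1, 0.5, 0.8),
                    (3, 0, 0.5, 0.7), (7, 2, 0.5, 0.6)]]
doc_id = 7
queried_doc_id, _, _, _ = zip(*queried_results[0])
recall_chunks = queried_doc_id.count(doc_id)   # 3 chunks of doc 7 returned
total_chunks_in_doc = 4                        # num_chunks_in_doc(7), via helper indexer
print(recall_chunks / total_chunks_in_doc)     # 0.75
```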


class TFIDFChunkScoreFn(CombinedScoreFn):
"""
score = relevance * tf(q_chunk) * (idf(q_chunk)**2)
tf(q_chunk) is calculated from the relevance of the query results:
tf(q_chunk) = number of queried chunks where relevance >= threshold
idf(q_chunk) = log(total_chunks / (tf(q_chunk) + 1))
"""

def __init__(self, threshold: float = 0.8, *args, **kwargs):
super().__init__(*args, **kwargs)
self.threshold = threshold

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
tf_idf = get_unary_score(value=self._cal_tf_idf(queried_results),
name='query tf-idf')
return super().__call__(last_score, tf_idf)

def _cal_tf_idf(self, queried_results: List[List[Tuple]]):
_, _, _, queried_relevance = zip(*(queried_results[0]))
tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))

total_chunks = self._context.num_chunks
idf = np.log10(total_chunks / (tf + 1))
return tf * (idf ** 2)
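
Plugging hypothetical numbers into the `_cal_tf_idf` arithmetic (a corpus of 1000 chunks, a 0.8 threshold, and made-up relevance scores):

```python
import numpy as np

queried_relevance = (0.95, 0.9, 0.85, 0.81, 0.4, 0.1)
threshold = 0.8
tf = len([r for r in queried_relevance if r >= threshold])  # 4 chunks clear the bar
total_chunks = 1000
idf = np.log10(total_chunks / (tf + 1))                     # log10(200) ≈ 2.30
print(tf * idf ** 2)                                        # ≈ 21.2
```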


class BM25ChunkScoreFn(CombinedScoreFn):
"""
score = relevance * idf(q_chunk) * tf(q_chunk) * (k1 + 1) / (tf(q_chunk) +
k1 * (1 - b + b * (chunk_in_doc / avg_chunk_in_doc)))
in bm25 algorithm:
idf(q_chunk) = log(1 + (doc_count - f(q_chunk) + 0.5) / (f(q_chunk) + 0.5)),
where f(q_chunk) is the number of docs that contain q_chunk. In our system, this denotes
the number of docs appearing in the query results.
Following Elasticsearch's defaults, b = 0.75, k1 = 1.2
"""

def __init__(self, threshold: float = 0.8, *args, **kwargs):
super().__init__(*args, **kwargs)
self.threshold = threshold
self.k1 = 1.2
self.b = 0.75

def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
q_chunk: 'gnes_pb2.Chunk',
d_chunk: 'gnes_pb2.Chunk',
queried_results: List[List[Tuple]],
*args, **kwargs):
bm25 = get_unary_score(value=self._cal_bm25(d_chunk, queried_results),
name='query bm25')
return super().__call__(last_score, bm25)

def _cal_bm25(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
doc_id = d_chunk.doc_id
_, _, _, queried_relevance = zip(*(queried_results[0]))
tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))

total_chunks = self._context.num_chunks
idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))
return idf * tf * (self.k1 + 1) / (tf + self.k1 * (1 - self.b + self.b *
(self._context.num_chunks_in_doc(doc_id) * self._context.num_docs / self._context.num_chunks)))
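
The same style of worked example for `_cal_bm25`, with hypothetical corpus numbers; the last factor rescales `num_chunks_in_doc * num_docs / num_chunks`, i.e. this doc's length relative to the average doc, which is the standard BM25 length normalization:

```python
import numpy as np

# 1000 chunks across 100 docs (avg 10 chunks/doc); the matched doc holds
# 20 chunks and tf = 4 returned chunks cleared the relevance threshold
tf, k1, b = 4, 1.2, 0.75
total_chunks, num_docs, chunks_in_doc = 1000, 100, 20

idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))    # ≈ 2.35
len_ratio = chunks_in_doc * num_docs / total_chunks           # doc is 2x average length
norm = 1 - b + b * len_ratio                                  # 1.75
print(idf * tf * (k1 + 1) / (tf + k1 * norm))                 # ≈ 3.39, damped by length
```
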
19 changes: 18 additions & 1 deletion tests/test_score_fn.py
Expand Up @@ -4,7 +4,7 @@

from gnes.proto import gnes_pb2
from gnes.score_fn.base import get_unary_score, CombinedScoreFn, ModifierScoreFn
from gnes.score_fn.chunk import WeightedChunkScoreFn
from gnes.score_fn.chunk import WeightedChunkScoreFn, WeightedChunkOffsetScoreFn, CoordChunkScoreFn, TFIDFChunkScoreFn, BM25ChunkScoreFn
from gnes.score_fn.normalize import Normalizer1, Normalizer2, Normalizer3, Normalizer4


@@ -27,6 +27,23 @@ def test_op(self):
self.assertAlmostEqual(c.value, 0.99)
print(c)

def test_combine_score_fn(self):
from gnes.indexer.chunk.helper import ListKeyIndexer
from gnes.indexer.chunk.numpy import NumpyIndexer
from gnes.proto import array2blob
import numpy as np

q_chunk = gnes_pb2.Chunk()
q_chunk.doc_id = 2
q_chunk.weight = 0.3
q_chunk.offset = 0
q_chunk.embedding.CopyFrom(array2blob(np.array([3, 3, 3])))

for _fn in [WeightedChunkOffsetScoreFn, CoordChunkScoreFn, TFIDFChunkScoreFn, BM25ChunkScoreFn]:
indexer = NumpyIndexer(helper_indexer=ListKeyIndexer(), score_fn=_fn())
indexer.add(keys=[(0, 1), (1, 2)], vectors=np.array([[1, 1, 1], [2, 2, 2]]), weights=[0.5, 0.8])
queried_result = indexer.query_and_score(q_chunks=[q_chunk], top_k=2)

def test_normalizer(self):
a = get_unary_score(0.5)
norm_op = Normalizer1()
