From c48adc2c7f0d495df37442865f8bfa7f0175d456 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Mon, 2 Sep 2019 15:49:47 +0800 Subject: [PATCH 1/4] fix(indexer): fix empty chunks indexing --- gnes/service/encoder.py | 2 +- gnes/service/indexer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py index ed1c2cbc..0206fca1 100644 --- a/gnes/service/encoder.py +++ b/gnes/service/encoder.py @@ -53,7 +53,7 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2. 'chunk content is in type: %s, dont kow how to handle that, ignored' % c.WhichOneof('content')) chunks.append(c) - if do_encoding: + if do_encoding and contents: embeds = self._model.encode(contents) if len(chunks) != embeds.shape[0]: raise ServiceError( diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index cf7baade..b20cbf5c 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -53,6 +53,8 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'): offsets += [c.offset for c in d.chunks] weights += [c.weight for c in d.chunks] + self.logger.info('%d %d %d %d' % (len(vecs), len(doc_ids), len(offsets), len(weights))) + self.logger.info(np.concatenate(vecs, 0).shape) if vecs: self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights) From 2ba135dbfe1be58841694b25da9fa360fdcd7ced Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Mon, 2 Sep 2019 16:15:05 +0800 Subject: [PATCH 2/4] fix(indexer): fix empty chunks indexing --- gnes/service/indexer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index b20cbf5c..e401df33 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -48,6 +48,8 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'): self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id) continue + for c in d.chunks: + self.logger.info(c.embedding) vecs += [blob2array(c.embedding) for c in d.chunks] doc_ids += [d.doc_id] * len(d.chunks) offsets += [c.offset for c in d.chunks] From 2d6c70fc389d2928714576ef1d58afb5805ce838 Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Mon, 2 Sep 2019 16:29:37 +0800 Subject: [PATCH 3/4] fix(indexer): fix vec np.concat --- gnes/indexer/base.py | 2 +- gnes/service/indexer.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index 9763048e..ba1b4eed 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -51,7 +51,7 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, **kwargs) -> List[ 'gnes_pb2.Response.QueryResponse.ScoredResult']: vecs = [blob2array(c.embedding) for c in q_chunks] - queried_results = self.query(np.concatenate(vecs, 0), top_k=top_k) + queried_results = self.query(np.stack(vecs), top_k=top_k) results = [] for q_chunk, topk_chunks in zip(q_chunks, queried_results): for _doc_id, _offset, _weight, _relevance in topk_chunks: diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index e401df33..8550d951 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -48,17 +48,15 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'): self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id) continue - for c in d.chunks: - self.logger.info(c.embedding) vecs += [blob2array(c.embedding) for c in d.chunks] doc_ids += [d.doc_id] * len(d.chunks) offsets += [c.offset for c in d.chunks] weights += [c.weight for c in d.chunks] - self.logger.info('%d %d %d %d' % (len(vecs), len(doc_ids), len(offsets), len(weights))) - self.logger.info(np.concatenate(vecs, 0).shape) + # self.logger.info('%d %d %d %d' % (len(vecs), len(doc_ids), len(offsets), len(weights))) + # self.logger.info(np.stack(vecs).shape) if vecs: - self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights) + self._model.add(list(zip(doc_ids, offsets)), np.stack(vecs), weights) def _handler_doc_index(self, msg: 'gnes_pb2.Message'): self._model.add([d.doc_id for d in msg.request.index.docs], From a46582508727c0247cf855fd6d88a1279e6ed05f Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Mon, 2 Sep 2019 17:05:20 +0800 Subject: [PATCH 4/4] fix(scorer): fix np float conversion --- gnes/indexer/base.py | 22 +++++++++++----------- gnes/router/base.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py index ba1b4eed..c1ddc423 100644 --- a/gnes/indexer/base.py +++ b/gnes/indexer/base.py @@ -115,16 +115,16 @@ def eq1(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk', score.explained = json.dumps({ 'name': 'chunk-eq1', 'operand': [{'name': 'd_chunk_weight', - 'value': d_chunk.weight, + 'value': float(d_chunk.weight), 'doc_id': d_chunk.doc_id, 'offset': d_chunk.offset}, {'name': 'q_chunk_weight', - 'value': q_chunk.weight, + 'value': float(q_chunk.weight), 'offset': q_chunk.offset}, {'name': 'relevance', - 'value': relevance}], + 'value': float(relevance)}], 'op': 'prod', - 'value': score.value + 'value': float(score.value) }) return score @@ -152,18 +152,18 @@ def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'): score.explained = json.dumps({ 'name': 'chunk-eq2', 'operand': [{'name': 'd_chunk_weight', - 'value': d_chunk.weight, + 'value': float(d_chunk.weight), 'doc_id': d_chunk.doc_id, 'offset': d_chunk.offset}, {'name': 'q_chunk_weight', - 'value': q_chunk.weight, + 'value': float(q_chunk.weight), 'offset': q_chunk.offset}, {'name': 'relevance', - 'value': relevance}, + 'value': float(relevance)}, {'name': 'offset_divergence', - 'value': divergence}], + 'value': float(divergence)}], 'op': 'prod', - 'value': score.value + 'value': float(score.value) }) return score @@ -184,10 +184,10 @@ def eq1(d: 'gnes_pb2.Document', 'name': 'doc-eq1', 'operand': [json.loads(s.explained), {'name': 'doc_weight', - 'value': d.weight, + 'value': float(d.weight), 'doc_id': d.doc_id}], 'op': 'prod', - 'value': s.value + 'value': float(s.value) }) return s diff --git a/gnes/router/base.py b/gnes/router/base.py index ec986dfe..092973d8 100644 --- a/gnes/router/base.py +++ b/gnes/router/base.py @@ -107,7 +107,7 @@ def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], * 'name': 'topk-reduce', 'op': self._reduce_op, 'operand': [json.loads(vv) for vv in v['explains']], - 'value': r.score.value + 'value': float(r.score.value) }) self.set_key(r, k)