Merge pull request #186 from gnes-ai/fix-empty-chunk-inde

fix(indexer): fix vec np.concat
gnes-ai · Sep 2, 2019 · 3a18111
2 parents 3524152 + a465825
commit 3a18111
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 15 deletions.
diff --git a/gnes/indexer/base.py b/gnes/indexer/base.py
@@ -51,7 +51,7 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl
     def query_and_score(self, q_chunks: List['gnes_pb2.Chunk'], top_k: int, *args, **kwargs) -> List[
         'gnes_pb2.Response.QueryResponse.ScoredResult']:
         vecs = [blob2array(c.embedding) for c in q_chunks]
-        queried_results = self.query(np.concatenate(vecs, 0), top_k=top_k)
+        queried_results = self.query(np.stack(vecs), top_k=top_k)
         results = []
         for q_chunk, topk_chunks in zip(q_chunks, queried_results):
             for _doc_id, _offset, _weight, _relevance in topk_chunks:
@@ -115,16 +115,16 @@ def eq1(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk',
         score.explained = json.dumps({
             'name': 'chunk-eq1',
             'operand': [{'name': 'd_chunk_weight',
-                         'value': d_chunk.weight,
+                         'value': float(d_chunk.weight),
                          'doc_id': d_chunk.doc_id,
                          'offset': d_chunk.offset},
                         {'name': 'q_chunk_weight',
-                         'value': q_chunk.weight,
+                         'value': float(q_chunk.weight),
                          'offset': q_chunk.offset},
                         {'name': 'relevance',
-                         'value': relevance}],
+                         'value': float(relevance)}],
             'op': 'prod',
-            'value': score.value
+            'value': float(score.value)
         })
         return score
 
@@ -152,18 +152,18 @@ def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
         score.explained = json.dumps({
             'name': 'chunk-eq2',
             'operand': [{'name': 'd_chunk_weight',
-                         'value': d_chunk.weight,
+                         'value': float(d_chunk.weight),
                          'doc_id': d_chunk.doc_id,
                          'offset': d_chunk.offset},
                         {'name': 'q_chunk_weight',
-                         'value': q_chunk.weight,
+                         'value': float(q_chunk.weight),
                          'offset': q_chunk.offset},
                         {'name': 'relevance',
-                         'value': relevance},
+                         'value': float(relevance)},
                         {'name': 'offset_divergence',
-                         'value': divergence}],
+                         'value': float(divergence)}],
             'op': 'prod',
-            'value': score.value
+            'value': float(score.value)
         })
         return score
 
@@ -184,10 +184,10 @@ def eq1(d: 'gnes_pb2.Document',
             'name': 'doc-eq1',
             'operand': [json.loads(s.explained),
                         {'name': 'doc_weight',
-                         'value': d.weight,
+                         'value': float(d.weight),
                          'doc_id': d.doc_id}],
             'op': 'prod',
-            'value': s.value
+            'value': float(s.value)
         })
         return s
 

diff --git a/gnes/router/base.py b/gnes/router/base.py
@@ -107,7 +107,7 @@ def apply(self, msg: 'gnes_pb2.Message', accum_msgs: List['gnes_pb2.Message'], *
                 'name': 'topk-reduce',
                 'op': self._reduce_op,
                 'operand': [json.loads(vv) for vv in v['explains']],
-                'value': r.score.value
+                'value': float(r.score.value)
             })
             self.set_key(r, k)
 

diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py
@@ -53,7 +53,7 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
                         'chunk content is in type: %s, dont kow how to handle that, ignored' % c.WhichOneof('content'))
                 chunks.append(c)
 
-        if do_encoding:
+        if do_encoding and contents:
             embeds = self._model.encode(contents)
             if len(chunks) != embeds.shape[0]:
                 raise ServiceError(

diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
@@ -53,8 +53,10 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
             offsets += [c.offset for c in d.chunks]
             weights += [c.weight for c in d.chunks]
 
+            # self.logger.info('%d %d %d %d' % (len(vecs), len(doc_ids), len(offsets), len(weights)))
+            # self.logger.info(np.stack(vecs).shape)
         if vecs:
-            self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
+            self._model.add(list(zip(doc_ids, offsets)), np.stack(vecs), weights)
 
     def _handler_doc_index(self, msg: 'gnes_pb2.Message'):
         self._model.add([d.doc_id for d in msg.request.index.docs],