This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

fix(service): fix exception when no chunks
hanhxiao committed Aug 30, 2019
1 parent 21d88e4 commit 417f41f
Showing 4 changed files with 21 additions and 16 deletions.
4 changes: 2 additions & 2 deletions gnes/indexer/doc/leveldb.py
@@ -47,8 +47,8 @@ def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs)
                 if self.drop_raw_bytes:
                     d.raw_bytes = b''
                 if self.drop_chunk_blob:
-                    for i in range(len(d.chunks)):
-                        d.chunks[i].ClearField('blob')
+                    for c in d.chunks:
+                        c.ClearField('blob')
                 doc = d.SerializeToString()
                 wb.put(doc_id, doc)
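The change above is purely an iteration-style cleanup: protobuf repeated message fields yield mutable element references, so calling ClearField on each element directly is equivalent to indexing with d.chunks[i]. A minimal illustrative sketch (not part of the commit), assuming the generated protos are importable as gnes.proto.gnes_pb2 and using only chunk fields (text, blob) that appear in this commit's diffs:

# Illustrative only; the import path for the generated GNES protos is an assumption.
from gnes.proto import gnes_pb2

doc = gnes_pb2.Document()
doc.chunks.add().text = 'hello'
doc.chunks.add().text = 'world'

# Iterating the repeated field gives references, so in-place mutation works;
# ClearField('blob') is a no-op when 'blob' was never set.
for c in doc.chunks:
    c.ClearField('blob')

print(len(doc.chunks), [c.text for c in doc.chunks])  # 2 ['hello', 'world']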

5 changes: 2 additions & 3 deletions gnes/service/encoder.py
@@ -38,9 +38,6 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
-            if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:
@@ -50,6 +47,8 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
                 else:
                     raise ServiceError(
                         'chunk content is in type: %s, dont kow how to handle that' % c.WhichOneof('content'))
+            else:
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
 
         if do_encoding:
             embeds = self._model.encode(contents)
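The behavioral intent of the encoder change: a document without chunks is logged as a warning instead of raising a ServiceError that aborts the whole batch, and encoding proceeds with whatever chunks were collected. A self-contained sketch of that warn-and-skip pattern (collect_chunk_texts and the SimpleNamespace stand-ins are illustrative, not GNES APIs):

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('encoder-sketch')

def collect_chunk_texts(docs):
    """Gather chunk texts; warn (instead of raising) on documents without chunks."""
    contents = []
    for d in docs:
        if not d.chunks:
            logger.warning('document (doc_id=%s) contains no chunks!', d.doc_id)
            continue
        contents.extend(c.text for c in d.chunks)
    return contents

doc_ok = SimpleNamespace(doc_id=1, chunks=[SimpleNamespace(text='hello')])
doc_empty = SimpleNamespace(doc_id=2, chunks=[])
print(collect_chunk_texts([doc_ok, doc_empty]))  # ['hello'], plus a warning for doc_id=2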
17 changes: 9 additions & 8 deletions gnes/service/indexer.py
@@ -45,14 +45,15 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
 
         for d in msg.request.index.docs:
             if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
-            vecs += [blob2array(c.embedding) for c in d.chunks]
-            doc_ids += [d.doc_id] * len(d.chunks)
-            offsets += [c.offset for c in d.chunks]
-            weights += [c.weight for c in d.chunks]
-
-        self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
+            else:
+                vecs += [blob2array(c.embedding) for c in d.chunks]
+                doc_ids += [d.doc_id] * len(d.chunks)
+                offsets += [c.offset for c in d.chunks]
+                weights += [c.weight for c in d.chunks]
+
+        if vecs:
+            self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
 
     def _handler_doc_index(self, msg: 'gnes_pb2.Message'):
         self._model.add([d.doc_id for d in msg.request.index.docs],
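The added `if vecs:` guard matters because np.concatenate raises ValueError on an empty list, so without it the removed ServiceError would simply be traded for a different exception whenever no incoming document carries chunks. A small sketch of that failure mode and the guard (the merged array is a stand-in for what would be passed to self._model.add):

import numpy as np

vecs = []  # what the handler is left with when no document carried any chunks

try:
    np.concatenate(vecs, 0)
except ValueError as e:
    print('unguarded call fails:', e)  # "need at least one array to concatenate"

if vecs:  # the commit's guard: only touch the index when there is something to add
    merged = np.concatenate(vecs, 0)  # stand-in for the self._model.add(...) call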
11 changes: 8 additions & 3 deletions gnes/service/preprocessor.py
@@ -28,13 +28,18 @@ def post_init(self):
     @handler.register(gnes_pb2.Request.TrainRequest)
     def _handler_train(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.train.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.IndexRequest)
     def _handler_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.QueryRequest)
     def _handler_query(self, msg: 'gnes_pb2.Message'):
-        self._model.apply(msg.request.search.query)
+        self._apply(msg.request.search.query)
+
+    def _apply(self, d: 'gnes_pb2.Document'):
+        self._model.apply(d)
+        if not d.chunks:
+            self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
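The preprocessor change is an extract-method refactor: all three handlers now route documents through a single _apply helper, which runs the model and emits the no-chunks warning in one place. A hypothetical, runnable sketch of that shape (PreprocessorSketch and the SimpleNamespace model are illustrative stand-ins, not GNES classes):

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.WARNING)

class PreprocessorSketch:
    """Every handler funnels documents through _apply(), so the chunk check lives in one place."""

    def __init__(self, model):
        self._model = model
        self.logger = logging.getLogger('preprocessor-sketch')

    def _apply(self, d):
        self._model.apply(d)
        if not d.chunks:
            self.logger.warning('document (doc_id=%s) contains no chunks!', d.doc_id)

model = SimpleNamespace(apply=lambda d: None)  # stand-in for the real preprocessor model
empty_doc = SimpleNamespace(doc_id=7, chunks=[])
PreprocessorSketch(model)._apply(empty_doc)  # logs: document (doc_id=7) contains no chunks!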
