fix(service): raise an exception when a document contains no chunks

gnes-ai · Aug 29, 2019 · 4efea72
1 parent 31bffeb
commit 4efea72
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 6 deletions.
diff --git a/gnes/preprocessor/text/split.py b/gnes/preprocessor/text/split.py
@@ -23,7 +23,7 @@
 
 class SentSplitPreprocessor(BaseTextPreprocessor):
     def __init__(self,
-                 min_sent_len: int = 8,
+                 min_sent_len: int = 1,
                  max_sent_len: int = 256,
                  deliminator: str = '.!?。！？',
                  is_json: bool = False,

diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py
@@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
+            if not d.chunks:
+                raise ServiceError('document contains no chunks! doc: %s' % d)
+
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:

diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
@@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
             if not d.chunks:
                 raise ServiceError('document contains no chunks! doc: %s' % d)
-            else:
-                vecs += [blob2array(c.embedding) for c in d.chunks]
-                doc_ids += [d.doc_id] * len(d.chunks)
-                offsets += [c.offset for c in d.chunks]
-                weights += [c.weight for c in d.chunks]
+
+            vecs += [blob2array(c.embedding) for c in d.chunks]
+            doc_ids += [d.doc_id] * len(d.chunks)
+            offsets += [c.offset for c in d.chunks]
+            weights += [c.weight for c in d.chunks]
 
         self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)