From 4efea7263dc4558eb74dd2544715ba1fb0d5312d Mon Sep 17 00:00:00 2001
From: hanhxiao <hanhxiao@tencent.com>
Date: Thu, 29 Aug 2019 17:53:27 +0800
Subject: [PATCH] fix(service): raise exception when a document has no chunks

---
 gnes/preprocessor/text/split.py |  2 +-
 gnes/service/encoder.py         |  3 +++
 gnes/service/indexer.py         | 10 +++++-----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/gnes/preprocessor/text/split.py b/gnes/preprocessor/text/split.py
index 8bb60e97..5aa2b7a6 100644
--- a/gnes/preprocessor/text/split.py
+++ b/gnes/preprocessor/text/split.py
@@ -23,7 +23,7 @@
 
 class SentSplitPreprocessor(BaseTextPreprocessor):
     def __init__(self,
-                 min_sent_len: int = 8,
+                 min_sent_len: int = 1,
                  max_sent_len: int = 256,
                  deliminator: str = '.!?。！？',
                  is_json: bool = False,
diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py
index ae28af6e..b7c07ed4 100644
--- a/gnes/service/encoder.py
+++ b/gnes/service/encoder.py
@@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
+            if not d.chunks:
+                raise ServiceError('document contains no chunks! doc: %s' % d)
+
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:
diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
index 91306c53..57a0be33 100644
--- a/gnes/service/indexer.py
+++ b/gnes/service/indexer.py
@@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
             if not d.chunks:
                 raise ServiceError('document contains no chunks! doc: %s' % d)
-            else:
-                vecs += [blob2array(c.embedding) for c in d.chunks]
-                doc_ids += [d.doc_id] * len(d.chunks)
-                offsets += [c.offset for c in d.chunks]
-                weights += [c.weight for c in d.chunks]
+
+            vecs += [blob2array(c.embedding) for c in d.chunks]
+            doc_ids += [d.doc_id] * len(d.chunks)
+            offsets += [c.offset for c in d.chunks]
+            weights += [c.weight for c in d.chunks]
 
         self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)