From 4efea7263dc4558eb74dd2544715ba1fb0d5312d Mon Sep 17 00:00:00 2001 From: hanhxiao Date: Thu, 29 Aug 2019 17:53:27 +0800 Subject: [PATCH] fix(service): raise exception when document has no chunks --- gnes/preprocessor/text/split.py | 2 +- gnes/service/encoder.py | 3 +++ gnes/service/indexer.py | 10 +++++----- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/gnes/preprocessor/text/split.py b/gnes/preprocessor/text/split.py index 8bb60e97..5aa2b7a6 100644 --- a/gnes/preprocessor/text/split.py +++ b/gnes/preprocessor/text/split.py @@ -23,7 +23,7 @@ class SentSplitPreprocessor(BaseTextPreprocessor): def __init__(self, - min_sent_len: int = 8, + min_sent_len: int = 1, max_sent_len: int = 256, deliminator: str = '.!?。!?', is_json: bool = False, diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py index ae28af6e..b7c07ed4 100644 --- a/gnes/service/encoder.py +++ b/gnes/service/encoder.py @@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2. embeds = None for d in docs: + if not d.chunks: + raise ServiceError('document contains no chunks! doc: %s' % d) + for c in d.chunks: chunks.append(c) if d.doc_type == gnes_pb2.Document.TEXT: diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index 91306c53..57a0be33 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'): for d in msg.request.index.docs: if not d.chunks: raise ServiceError('document contains no chunks! doc: %s' % d) - else: - vecs += [blob2array(c.embedding) for c in d.chunks] - doc_ids += [d.doc_id] * len(d.chunks) - offsets += [c.offset for c in d.chunks] - weights += [c.weight for c in d.chunks] + + vecs += [blob2array(c.embedding) for c in d.chunks] + doc_ids += [d.doc_id] * len(d.chunks) + offsets += [c.offset for c in d.chunks] + weights += [c.weight for c in d.chunks] self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)