Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
fix(service): raise an exception when a document contains no chunks
Browse files: browse the repository at this point in the history
  • Loading branch information
hanhxiao committed Aug 29, 2019
1 parent 31bffeb commit 4efea72
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
2 changes: 1 addition & 1 deletion gnes/preprocessor/text/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self,
min_sent_len: int = 8,
min_sent_len: int = 1,
max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool = False,
Expand Down
3 changes: 3 additions & 0 deletions gnes/service/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
embeds = None

for d in docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)

for c in d.chunks:
chunks.append(c)
if d.doc_type == gnes_pb2.Document.TEXT:
Expand Down
10 changes: 5 additions & 5 deletions gnes/service/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
for d in msg.request.index.docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)
else:
vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)

Expand Down

0 comments on commit 4efea72

Please sign in to comment.