This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

fix(service): fix exception when no chunks
hanhxiao committed Aug 30, 2019
1 parent 21d88e4 commit 417f41f
Showing 4 changed files with 21 additions and 16 deletions.
4 changes: 2 additions & 2 deletions gnes/indexer/doc/leveldb.py
@@ -47,8 +47,8 @@ def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs)
                 if self.drop_raw_bytes:
                     d.raw_bytes = b''
                 if self.drop_chunk_blob:
-                    for i in range(len(d.chunks)):
-                        d.chunks[i].ClearField('blob')
+                    for c in d.chunks:
+                        c.ClearField('blob')
                 doc = d.SerializeToString()
                 wb.put(doc_id, doc)
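The change above is purely an iteration-style cleanup: protobuf repeated message fields yield mutable element references, so calling ClearField on each element directly is equivalent to indexing with d.chunks[i]. A minimal illustrative sketch (not part of the commit), assuming the generated protos are importable as gnes.proto.gnes_pb2 and using only chunk fields (text, blob) that appear in this commit's diffs:

# Illustrative only; the import path for the generated GNES protos is an assumption.
from gnes.proto import gnes_pb2

doc = gnes_pb2.Document()
doc.chunks.add().text = 'hello'
doc.chunks.add().text = 'world'

# Iterating the repeated field gives references, so in-place mutation works;
# ClearField('blob') is a no-op when 'blob' was never set.
for c in doc.chunks:
    c.ClearField('blob')

print(len(doc.chunks), [c.text for c in doc.chunks])  # 2 ['hello', 'world']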

5 changes: 2 additions & 3 deletions gnes/service/encoder.py
@@ -38,9 +38,6 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
-            if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:
@@ -50,6 +47,8 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
                 else:
                     raise ServiceError(
                         'chunk content is in type: %s, dont kow how to handle that' % c.WhichOneof('content'))
+            else:
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
 
         if do_encoding:
             embeds = self._model.encode(contents)
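The behavioral intent of the encoder change: a document without chunks is logged as a warning instead of raising a ServiceError that aborts the whole batch, and encoding proceeds with whatever chunks were collected. A self-contained sketch of that warn-and-skip pattern (collect_chunk_texts and the SimpleNamespace stand-ins are illustrative, not GNES APIs):

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('encoder-sketch')

def collect_chunk_texts(docs):
    """Gather chunk texts; warn (instead of raising) on documents without chunks."""
    contents = []
    for d in docs:
        if not d.chunks:
            logger.warning('document (doc_id=%s) contains no chunks!', d.doc_id)
            continue
        contents.extend(c.text for c in d.chunks)
    return contents

doc_ok = SimpleNamespace(doc_id=1, chunks=[SimpleNamespace(text='hello')])
doc_empty = SimpleNamespace(doc_id=2, chunks=[])
print(collect_chunk_texts([doc_ok, doc_empty]))  # ['hello'], plus a warning for doc_id=2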
17 changes: 9 additions & 8 deletions gnes/service/indexer.py
@@ -45,14 +45,15 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
 
         for d in msg.request.index.docs:
             if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
-            vecs += [blob2array(c.embedding) for c in d.chunks]
-            doc_ids += [d.doc_id] * len(d.chunks)
-            offsets += [c.offset for c in d.chunks]
-            weights += [c.weight for c in d.chunks]
-
-        self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
+            else:
+                vecs += [blob2array(c.embedding) for c in d.chunks]
+                doc_ids += [d.doc_id] * len(d.chunks)
+                offsets += [c.offset for c in d.chunks]
+                weights += [c.weight for c in d.chunks]
+
+        if vecs:
+            self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
 
     def _handler_doc_index(self, msg: 'gnes_pb2.Message'):
         self._model.add([d.doc_id for d in msg.request.index.docs],
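The added `if vecs:` guard matters because np.concatenate raises ValueError on an empty list, so without it the removed ServiceError would simply be traded for a different exception whenever no incoming document carries chunks. A small sketch of that failure mode and the guard (the merged array is a stand-in for what would be passed to self._model.add):

import numpy as np

vecs = []  # what the handler is left with when no document carried any chunks

try:
    np.concatenate(vecs, 0)
except ValueError as e:
    print('unguarded call fails:', e)  # "need at least one array to concatenate"

if vecs:  # the commit's guard: only touch the index when there is something to add
    merged = np.concatenate(vecs, 0)  # stand-in for the self._model.add(...) call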
11 changes: 8 additions & 3 deletions gnes/service/preprocessor.py
@@ -28,13 +28,18 @@ def post_init(self):
     @handler.register(gnes_pb2.Request.TrainRequest)
     def _handler_train(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.train.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.IndexRequest)
     def _handler_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.QueryRequest)
     def _handler_query(self, msg: 'gnes_pb2.Message'):
-        self._model.apply(msg.request.search.query)
+        self._apply(msg.request.search.query)
+
+    def _apply(self, d: 'gnes_pb2.Document'):
+        self._model.apply(d)
+        if not d.chunks:
+            self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
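The preprocessor change is an extract-method refactor: all three handlers now route documents through a single _apply helper, which runs the model and emits the no-chunks warning in one place. A hypothetical, runnable sketch of that shape (PreprocessorSketch and the SimpleNamespace model are illustrative stand-ins, not GNES classes):

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.WARNING)

class PreprocessorSketch:
    """Every handler funnels documents through _apply(), so the chunk check lives in one place."""

    def __init__(self, model):
        self._model = model
        self.logger = logging.getLogger('preprocessor-sketch')

    def _apply(self, d):
        self._model.apply(d)
        if not d.chunks:
            self.logger.warning('document (doc_id=%s) contains no chunks!', d.doc_id)

model = SimpleNamespace(apply=lambda d: None)  # stand-in for the real preprocessor model
empty_doc = SimpleNamespace(doc_id=7, chunks=[])
PreprocessorSketch(model)._apply(empty_doc)  # logs: document (doc_id=7) contains no chunks!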
