diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py index bcda5ceb..8e4c962e 100644 --- a/gnes/service/encoder.py +++ b/gnes/service/encoder.py @@ -38,17 +38,19 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2. embeds = None for d in docs: + if not d.chunks: + self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id) + continue + for c in d.chunks: - chunks.append(c) if d.doc_type == gnes_pb2.Document.TEXT: contents.append(c.text) elif getattr(c, c.WhichOneof('content')) == 'blob': contents.append(blob2array(c.blob)) else: - raise ServiceError( - 'chunk content is in type: %s, dont kow how to handle that' % c.WhichOneof('content')) - else: - self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id) + self.logger.warning( + 'chunk content is in type: %s, dont kow how to handle that, ignored' % c.WhichOneof('content')) + chunks.append(c) if do_encoding: embeds = self._model.encode(contents) diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index 84a9e1bd..cf7baade 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -46,11 +46,12 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'): for d in msg.request.index.docs: if not d.chunks: self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id) - else: - vecs += [blob2array(c.embedding) for c in d.chunks] - doc_ids += [d.doc_id] * len(d.chunks) - offsets += [c.offset for c in d.chunks] - weights += [c.weight for c in d.chunks] + continue + + vecs += [blob2array(c.embedding) for c in d.chunks] + doc_ids += [d.doc_id] * len(d.chunks) + offsets += [c.offset for c in d.chunks] + weights += [c.weight for c in d.chunks] if vecs: self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)