diff --git a/gnes/indexer/doc/leveldb.py b/gnes/indexer/doc/leveldb.py
index a2366ce3..cbcab108 100644
--- a/gnes/indexer/doc/leveldb.py
+++ b/gnes/indexer/doc/leveldb.py
@@ -47,8 +47,8 @@ def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs)
                 if self.drop_raw_bytes:
                     d.raw_bytes = b''
                 if self.drop_chunk_blob:
-                    for i in range(len(d.chunks)):
-                        d.chunks[i].ClearField('blob')
+                    for c in d.chunks:
+                        c.ClearField('blob')
                 doc = d.SerializeToString()
                 wb.put(doc_id, doc)
 
diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py
index b7c07ed4..bcda5ceb 100644
--- a/gnes/service/encoder.py
+++ b/gnes/service/encoder.py
@@ -38,9 +38,6 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
-            if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:
@@ -50,6 +47,8 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
                 else:
                     raise ServiceError(
                         'chunk content is in type: %s, dont kow how to handle that' % c.WhichOneof('content'))
+            else:
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
 
         if do_encoding:
             embeds = self._model.encode(contents)
diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
index 57a0be33..84a9e1bd 100644
--- a/gnes/service/indexer.py
+++ b/gnes/service/indexer.py
@@ -45,14 +45,15 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
 
         for d in msg.request.index.docs:
             if not d.chunks:
-                raise ServiceError('document contains no chunks! doc: %s' % d)
-
-            vecs += [blob2array(c.embedding) for c in d.chunks]
-            doc_ids += [d.doc_id] * len(d.chunks)
-            offsets += [c.offset for c in d.chunks]
-            weights += [c.weight for c in d.chunks]
-
-        self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
+                self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
+            else:
+                vecs += [blob2array(c.embedding) for c in d.chunks]
+                doc_ids += [d.doc_id] * len(d.chunks)
+                offsets += [c.offset for c in d.chunks]
+                weights += [c.weight for c in d.chunks]
+
+        if vecs:
+            self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)
 
     def _handler_doc_index(self, msg: 'gnes_pb2.Message'):
         self._model.add([d.doc_id for d in msg.request.index.docs],
diff --git a/gnes/service/preprocessor.py b/gnes/service/preprocessor.py
index 746d0d85..9f45d8f8 100644
--- a/gnes/service/preprocessor.py
+++ b/gnes/service/preprocessor.py
@@ -28,13 +28,18 @@ def post_init(self):
     @handler.register(gnes_pb2.Request.TrainRequest)
     def _handler_train(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.train.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.IndexRequest)
     def _handler_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
-            self._model.apply(d)
+            self._apply(d)
 
     @handler.register(gnes_pb2.Request.QueryRequest)
     def _handler_query(self, msg: 'gnes_pb2.Message'):
-        self._model.apply(msg.request.search.query)
+        self._apply(msg.request.search.query)
+
+    def _apply(self, d: 'gnes_pb2.Document'):
+        self._model.apply(d)
+        if not d.chunks:
+            self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)