From 4e1e53fa82a60fe81eda6a5f94bff4cc155f67c8 Mon Sep 17 00:00:00 2001 From: Jem Date: Wed, 21 Aug 2019 14:41:46 +0800 Subject: [PATCH] feat(indexer): add preprocessor and lvdb for storing gif --- gnes/indexer/fulltext/filesys.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gnes/indexer/fulltext/filesys.py b/gnes/indexer/fulltext/filesys.py index e575e4fa..c9e93f3b 100644 --- a/gnes/indexer/fulltext/filesys.py +++ b/gnes/indexer/fulltext/filesys.py @@ -27,9 +27,11 @@ class DirectoryIndexer(BaseTextIndexer): def __init__(self, data_path: str, keep_na_doc: bool = True, + file_suffix: str = 'gif', *args, **kwargs): super().__init__(*args, **kwargs) self.data_path = data_path + self.file_suffix = file_suffix self.keep_na_doc = keep_na_doc self._NOT_FOUND = None @@ -44,15 +46,15 @@ def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs) dirs = os.path.join(self.data_path, str(k)) if not os.path.exists(dirs): os.makedirs(dirs) - file_type = self._get_file_type(d.doc_type) + self.file_suffix = self._get_file_type(d.doc_type) for i, chunk in enumerate(d.chunks): - with open(os.path.join(dirs, str(i)+file_type), 'wb') as f: + with open(os.path.join(dirs, str(i)+self.file_suffix), 'wb') as f: f.write(chunk.raw) def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']: """ :param keys: list of doc id - :return: list of documents whose chunks contain all the GIFs of this doc + :return: list of documents whose chunks field contains all the GIFs of this doc (one GIF per chunk) """ res = [] for k in keys: