diff --git a/gnes/indexer/__init__.py b/gnes/indexer/__init__.py index d113032f..46275eab 100644 --- a/gnes/indexer/__init__.py +++ b/gnes/indexer/__init__.py @@ -27,7 +27,8 @@ 'JointIndexer': 'base', 'BaseIndexer': 'base', 'BaseTextIndexer': 'base', - 'AnnoyIndexer': 'vector.annoy' + 'AnnoyIndexer': 'vector.annoy', + 'DirectoryIndexer': 'fulltext.filesys' } register_all_class(_cls2file_map, 'indexer') diff --git a/gnes/indexer/fulltext/filesys.py b/gnes/indexer/fulltext/filesys.py new file mode 100644 index 00000000..f5cb2d18 --- /dev/null +++ b/gnes/indexer/fulltext/filesys.py @@ -0,0 +1,75 @@ +# Tencent is pleased to support the open source community by making GNES available. +# +# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + +import os +from typing import List + +from ..base import BaseTextIndexer +from ...proto import gnes_pb2 + + +class DirectoryIndexer(BaseTextIndexer): + + def __init__(self, data_path: str, + keep_na_doc: bool = True, + file_suffix: str = 'gif', + *args, **kwargs): + super().__init__(*args, **kwargs) + self.data_path = data_path + self.file_suffix = file_suffix + self.keep_na_doc = keep_na_doc + self._NOT_FOUND = None + + def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs): + """ + write GIFs of each document into disk + folder structure: /data_path/doc_id/0.gif, 1.gif... + :param keys: list of doc id + :param docs: list of docs + """ + for k, d in zip(keys, docs): + dirs = os.path.join(self.data_path, str(k)) + if not os.path.exists(dirs): + os.makedirs(dirs) + for i, chunk in enumerate(d.chunks): + with open(os.path.join(dirs, '%d.%s' % (i, self.file_suffix)), 'wb') as f: + f.write(chunk.raw) + + def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']: + """ + :param keys: list of doc id + :return: list of documents whose chunks field contain all the GIFs of this doc(one GIF per chunk) + """ + res = [] + for k in keys: + doc = gnes_pb2.Document() + target_dirs = os.path.join(self.data_path, str(k)) + if not os.path.exists(target_dirs): + if self.keep_na_doc: + res.append(self._NOT_FOUND) + else: + for raw_file in os.listdir(target_dirs): + if not os.path.isdir(raw_file): + c = doc.chunks.add() + c.doc_id = k + with open(os.path.join(target_dirs, raw_file), 'rb') as raw: + c.raw = raw.read() + res.append(doc) + return res + + diff --git a/gnes/preprocessor/__init__.py b/gnes/preprocessor/__init__.py index 75de2019..c3a75c8c 100644 --- a/gnes/preprocessor/__init__.py +++ b/gnes/preprocessor/__init__.py @@ -34,6 +34,7 @@ 'ShotDetectPreprocessor': 'video.shotdetect', 'AudioVanilla': 'audio.audio_vanilla', 'BaseAudioPreprocessor': 'base' + 'RawChunkPreprocessor': 'base' } register_all_class(_cls2file_map, 'preprocessor') diff --git a/gnes/preprocessor/base.py b/gnes/preprocessor/base.py index c7479791..875ab9b9 100644 --- a/gnes/preprocessor/base.py +++ b/gnes/preprocessor/base.py @@ -19,7 +19,8 @@ import numpy as np from ..base import TrainableBase, CompositionalTrainableBase -from ..proto import gnes_pb2, array2blob +from ..proto import gnes_pb2, array2blob, blob2array +from .helper import get_gif class BasePreprocessor(TrainableBase): @@ -99,3 +100,27 @@ def raw_to_chunk(self, chunk: 'gnes_pb2.Chunk', raw_bytes: bytes): raise NotImplementedError else: raise NotImplementedError + + +class RawChunkPreprocessor(BasePreprocessor): + + @staticmethod + def _parse_chunk(chunk: 'gnes_pb2.Chunk', doc_type, *args, **kwargs): + if doc_type == gnes_pb2.Document.TEXT: + raise NotImplementedError + elif doc_type == gnes_pb2.Document.IMAGE: + raise NotImplementedError + elif doc_type == gnes_pb2.Document.VIDEO: + return get_gif(blob2array(chunk.blob)) + elif doc_type == gnes_pb2.Document.AUDIO: + raise NotImplementedError + else: + raise ValueError("doc type can only be TEXT, IMAGE, VIDEO or AUDIO!") + + def apply(self, doc: 'gnes_pb2.Document') -> None: + + if doc.raw_bytes: + for chunk in doc.chunks: + chunk.raw = self._parse_chunk(chunk, doc.doc_type) + else: + self.logger.error('bad document: "raw_bytes" is empty!') diff --git a/gnes/preprocessor/helper.py b/gnes/preprocessor/helper.py index c0670f6d..a91c8796 100644 --- a/gnes/preprocessor/helper.py +++ b/gnes/preprocessor/helper.py @@ -194,6 +194,25 @@ def split_video_frames(buffer_data: bytes, return [np.array(Image.open(io.BytesIO(chunk))) for chunk in chunks] +def get_gif(images, fps=4): + cmd = ['ffmpeg', '-y', + '-f', 'rawvideo', + '-vcodec', 'rawvideo', + '-r', '%.02f' % fps, + '-s', '%dx%d' % (images[0].shape[1], images[0].shape[0]), + '-pix_fmt', 'rgb24', + '-i', '-', + '-filter_complex', '[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse', + '-r', '%.02f' % fps, + '-f', 'gif', + '-'] + with sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, bufsize=-1, shell=False) as pipe: + for image in images: + pipe.stdin.write(image.tostring()) + out, _ = pipe.communicate() + return out + + def block_descriptor(image: 'np.ndarray', descriptor_fn: Callable, num_blocks: int = 3) -> 'np.ndarray': diff --git a/gnes/proto/gnes.proto b/gnes/proto/gnes.proto index 13c2f80c..14c8a196 100644 --- a/gnes/proto/gnes.proto +++ b/gnes/proto/gnes.proto @@ -25,6 +25,9 @@ message Chunk { // the original ndarry of the chunk (apply to image/video documents) NdArray blob = 3; + + // raw bytes of chunk; + bytes raw = 7; } message Coordinate { diff --git a/gnes/proto/gnes_pb2.py b/gnes/proto/gnes_pb2.py index fa6f4dd2..474bc5c7 100644 --- a/gnes/proto/gnes_pb2.py +++ b/gnes/proto/gnes_pb2.py @@ -21,7 +21,7 @@ package='gnes', syntax='proto3', serialized_options=None, - serialized_pb=_b('\n\ngnes.proto\x12\x04gnes\x1a\x1fgoogle/protobuf/timestamp.proto\"9\n\x07NdArray\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\x12\x11\n\x05shape\x18\x02 \x03(\rB\x02\x10\x01\x12\r\n\x05\x64type\x18\x03 \x01(\t\"\xbc\x01\n\x05\x43hunk\x12\x0e\n\x06\x64oc_id\x18\x01 \x01(\x04\x12\x0e\n\x04text\x18\x02 \x01(\tH\x00\x12\x1d\n\x04\x62lob\x18\x03 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\x11\n\toffset_1d\x18\x04 \x01(\r\x12)\n\toffset_nd\x18\x05 \x01(\x0b\x32\x16.gnes.Chunk.Coordinate\x12\x0e\n\x06weight\x18\x06 \x01(\x02\x1a\x1b\n\nCoordinate\x12\r\n\x01x\x18\x01 \x03(\rB\x02\x10\x01\x42\t\n\x07\x63ontent\"\xed\x02\n\x08\x44ocument\x12\x0e\n\x06\x64oc_id\x18\x01 \x01(\x04\x12\x1b\n\x06\x63hunks\x18\x02 \x03(\x0b\x32\x0b.gnes.Chunk\x12\'\n\x10\x63hunk_embeddings\x18\x03 \x01(\x0b\x32\r.gnes.NdArray\x12(\n\x08\x64oc_type\x18\x04 \x01(\x0e\x32\x16.gnes.Document.DocType\x12\x11\n\tmeta_info\x18\x05 \x01(\x0c\x12\x12\n\x08raw_text\x18\x06 \x01(\tH\x00\x12\"\n\traw_image\x18\x07 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\"\n\traw_video\x18\x08 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\x13\n\traw_bytes\x18\t \x01(\x0cH\x00\x12\x0e\n\x06weight\x18\n \x01(\x02\"A\n\x07\x44ocType\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x08\n\x04TEXT\x10\x01\x12\t\n\x05IMAGE\x10\x02\x12\t\n\x05VIDEO\x10\x03\x12\t\n\x05\x41UDIO\x10\x04\x42\n\n\x08raw_data\"\xd4\x01\n\x08\x45nvelope\x12\x11\n\tclient_id\x18\x01 \x01(\t\x12\x12\n\nrequest_id\x18\x02 \x01(\t\x12\x0f\n\x07part_id\x18\x03 \x01(\r\x12\x10\n\x08num_part\x18\x04 \x03(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12$\n\x06routes\x18\x06 \x03(\x0b\x32\x14.gnes.Envelope.route\x1aG\n\x05route\x12\x0f\n\x07service\x18\x01 \x01(\t\x12-\n\ttimestamp\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"y\n\x07Message\x12 \n\x08\x65nvelope\x18\x01 \x01(\x0b\x32\x0e.gnes.Envelope\x12 \n\x07request\x18\x02 \x01(\x0b\x32\r.gnes.RequestH\x00\x12\"\n\x08response\x18\x03 \x01(\x0b\x32\x0e.gnes.ResponseH\x00\x42\x06\n\x04\x62ody\"\xf6\x03\n\x07Request\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12+\n\x05train\x18\x02 \x01(\x0b\x32\x1a.gnes.Request.TrainRequestH\x00\x12+\n\x05index\x18\x03 \x01(\x0b\x32\x1a.gnes.Request.IndexRequestH\x00\x12,\n\x06search\x18\x04 \x01(\x0b\x32\x1a.gnes.Request.QueryRequestH\x00\x12/\n\x07\x63ontrol\x18\x05 \x01(\x0b\x32\x1c.gnes.Request.ControlRequestH\x00\x1a;\n\x0cTrainRequest\x12\x1c\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x0e.gnes.Document\x12\r\n\x05\x66lush\x18\x02 \x01(\x08\x1a,\n\x0cIndexRequest\x12\x1c\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x0e.gnes.Document\x1a<\n\x0cQueryRequest\x12\x1d\n\x05query\x18\x01 \x01(\x0b\x32\x0e.gnes.Document\x12\r\n\x05top_k\x18\x02 \x01(\r\x1am\n\x0e\x43ontrolRequest\x12\x35\n\x07\x63ommand\x18\x01 \x01(\x0e\x32$.gnes.Request.ControlRequest.Command\"$\n\x07\x43ommand\x12\r\n\tTERMINATE\x10\x00\x12\n\n\x06STATUS\x10\x01\x42\x06\n\x04\x62ody\"\xc4\x06\n\x08Response\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12-\n\x05train\x18\x02 \x01(\x0b\x32\x1c.gnes.Response.TrainResponseH\x00\x12-\n\x05index\x18\x03 \x01(\x0b\x32\x1c.gnes.Response.IndexResponseH\x00\x12.\n\x06search\x18\x04 \x01(\x0b\x32\x1c.gnes.Response.QueryResponseH\x00\x12\x31\n\x07\x63ontrol\x18\x05 \x01(\x0b\x32\x1e.gnes.Response.ControlResponseH\x00\x1a\x36\n\rTrainResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x36\n\rIndexResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x38\n\x0f\x43ontrolResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x81\x03\n\rQueryResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x12\r\n\x05top_k\x18\x02 \x01(\r\x12?\n\x0ctopk_results\x18\x03 \x03(\x0b\x32).gnes.Response.QueryResponse.ScoredResult\x12\x39\n\x05level\x18\x04 \x01(\x0e\x32*.gnes.Response.QueryResponse.ResponseLevel\x1a{\n\x0cScoredResult\x12\x1c\n\x05\x63hunk\x18\x01 \x01(\x0b\x32\x0b.gnes.ChunkH\x00\x12\x1d\n\x03\x64oc\x18\x02 \x01(\x0b\x32\x0e.gnes.DocumentH\x00\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x17\n\x0fscore_explained\x18\x04 \x01(\tB\x06\n\x04\x62ody\"A\n\rResponseLevel\x12\t\n\x05\x43HUNK\x10\x00\x12\x17\n\x13\x44OCUMENT_NOT_FILLED\x10\x01\x12\x0c\n\x08\x44OCUMENT\x10\x02\"-\n\x06Status\x12\x0b\n\x07SUCCESS\x10\x00\x12\t\n\x05\x45RROR\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x06\n\x04\x62ody2\xe3\x01\n\x07GnesRPC\x12(\n\x05Train\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12(\n\x05Index\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12(\n\x05Query\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12\'\n\x04\x43\x61ll\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12\x31\n\nStreamCall\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00(\x01\x30\x01\x62\x06proto3') + serialized_pb=_b('\n\ngnes.proto\x12\x04gnes\x1a\x1fgoogle/protobuf/timestamp.proto\"9\n\x07NdArray\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\x12\x11\n\x05shape\x18\x02 \x03(\rB\x02\x10\x01\x12\r\n\x05\x64type\x18\x03 \x01(\t\"\xcb\x01\n\x05\x43hunk\x12\x0e\n\x06\x64oc_id\x18\x01 \x01(\x04\x12\x0e\n\x04text\x18\x02 \x01(\tH\x00\x12\x1d\n\x04\x62lob\x18\x03 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\r\n\x03raw\x18\x07 \x01(\x0cH\x00\x12\x11\n\toffset_1d\x18\x04 \x01(\r\x12)\n\toffset_nd\x18\x05 \x01(\x0b\x32\x16.gnes.Chunk.Coordinate\x12\x0e\n\x06weight\x18\x06 \x01(\x02\x1a\x1b\n\nCoordinate\x12\r\n\x01x\x18\x01 \x03(\rB\x02\x10\x01\x42\t\n\x07\x63ontent\"\xed\x02\n\x08\x44ocument\x12\x0e\n\x06\x64oc_id\x18\x01 \x01(\x04\x12\x1b\n\x06\x63hunks\x18\x02 \x03(\x0b\x32\x0b.gnes.Chunk\x12\'\n\x10\x63hunk_embeddings\x18\x03 \x01(\x0b\x32\r.gnes.NdArray\x12(\n\x08\x64oc_type\x18\x04 \x01(\x0e\x32\x16.gnes.Document.DocType\x12\x11\n\tmeta_info\x18\x05 \x01(\x0c\x12\x12\n\x08raw_text\x18\x06 \x01(\tH\x00\x12\"\n\traw_image\x18\x07 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\"\n\traw_video\x18\x08 \x01(\x0b\x32\r.gnes.NdArrayH\x00\x12\x13\n\traw_bytes\x18\t \x01(\x0cH\x00\x12\x0e\n\x06weight\x18\n \x01(\x02\"A\n\x07\x44ocType\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x08\n\x04TEXT\x10\x01\x12\t\n\x05IMAGE\x10\x02\x12\t\n\x05VIDEO\x10\x03\x12\t\n\x05\x41UDIO\x10\x04\x42\n\n\x08raw_data\"\xd4\x01\n\x08\x45nvelope\x12\x11\n\tclient_id\x18\x01 \x01(\t\x12\x12\n\nrequest_id\x18\x02 \x01(\t\x12\x0f\n\x07part_id\x18\x03 \x01(\r\x12\x10\n\x08num_part\x18\x04 \x03(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12$\n\x06routes\x18\x06 \x03(\x0b\x32\x14.gnes.Envelope.route\x1aG\n\x05route\x12\x0f\n\x07service\x18\x01 \x01(\t\x12-\n\ttimestamp\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"y\n\x07Message\x12 \n\x08\x65nvelope\x18\x01 \x01(\x0b\x32\x0e.gnes.Envelope\x12 \n\x07request\x18\x02 \x01(\x0b\x32\r.gnes.RequestH\x00\x12\"\n\x08response\x18\x03 \x01(\x0b\x32\x0e.gnes.ResponseH\x00\x42\x06\n\x04\x62ody\"\xf6\x03\n\x07Request\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12+\n\x05train\x18\x02 \x01(\x0b\x32\x1a.gnes.Request.TrainRequestH\x00\x12+\n\x05index\x18\x03 \x01(\x0b\x32\x1a.gnes.Request.IndexRequestH\x00\x12,\n\x06search\x18\x04 \x01(\x0b\x32\x1a.gnes.Request.QueryRequestH\x00\x12/\n\x07\x63ontrol\x18\x05 \x01(\x0b\x32\x1c.gnes.Request.ControlRequestH\x00\x1a;\n\x0cTrainRequest\x12\x1c\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x0e.gnes.Document\x12\r\n\x05\x66lush\x18\x02 \x01(\x08\x1a,\n\x0cIndexRequest\x12\x1c\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x0e.gnes.Document\x1a<\n\x0cQueryRequest\x12\x1d\n\x05query\x18\x01 \x01(\x0b\x32\x0e.gnes.Document\x12\r\n\x05top_k\x18\x02 \x01(\r\x1am\n\x0e\x43ontrolRequest\x12\x35\n\x07\x63ommand\x18\x01 \x01(\x0e\x32$.gnes.Request.ControlRequest.Command\"$\n\x07\x43ommand\x12\r\n\tTERMINATE\x10\x00\x12\n\n\x06STATUS\x10\x01\x42\x06\n\x04\x62ody\"\xc4\x06\n\x08Response\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12-\n\x05train\x18\x02 \x01(\x0b\x32\x1c.gnes.Response.TrainResponseH\x00\x12-\n\x05index\x18\x03 \x01(\x0b\x32\x1c.gnes.Response.IndexResponseH\x00\x12.\n\x06search\x18\x04 \x01(\x0b\x32\x1c.gnes.Response.QueryResponseH\x00\x12\x31\n\x07\x63ontrol\x18\x05 \x01(\x0b\x32\x1e.gnes.Response.ControlResponseH\x00\x1a\x36\n\rTrainResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x36\n\rIndexResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x38\n\x0f\x43ontrolResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x1a\x81\x03\n\rQueryResponse\x12%\n\x06status\x18\x01 \x01(\x0e\x32\x15.gnes.Response.Status\x12\r\n\x05top_k\x18\x02 \x01(\r\x12?\n\x0ctopk_results\x18\x03 \x03(\x0b\x32).gnes.Response.QueryResponse.ScoredResult\x12\x39\n\x05level\x18\x04 \x01(\x0e\x32*.gnes.Response.QueryResponse.ResponseLevel\x1a{\n\x0cScoredResult\x12\x1c\n\x05\x63hunk\x18\x01 \x01(\x0b\x32\x0b.gnes.ChunkH\x00\x12\x1d\n\x03\x64oc\x18\x02 \x01(\x0b\x32\x0e.gnes.DocumentH\x00\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x17\n\x0fscore_explained\x18\x04 \x01(\tB\x06\n\x04\x62ody\"A\n\rResponseLevel\x12\t\n\x05\x43HUNK\x10\x00\x12\x17\n\x13\x44OCUMENT_NOT_FILLED\x10\x01\x12\x0c\n\x08\x44OCUMENT\x10\x02\"-\n\x06Status\x12\x0b\n\x07SUCCESS\x10\x00\x12\t\n\x05\x45RROR\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x06\n\x04\x62ody2\xe3\x01\n\x07GnesRPC\x12(\n\x05Train\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12(\n\x05Index\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12(\n\x05Query\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12\'\n\x04\x43\x61ll\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00\x12\x31\n\nStreamCall\x12\r.gnes.Request\x1a\x0e.gnes.Response\"\x00(\x01\x30\x01\x62\x06proto3') , dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR,]) @@ -56,8 +56,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=592, - serialized_end=657, + serialized_start=607, + serialized_end=672, ) _sym_db.RegisterEnumDescriptor(_DOCUMENT_DOCTYPE) @@ -78,8 +78,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=1468, - serialized_end=1504, + serialized_start=1483, + serialized_end=1519, ) _sym_db.RegisterEnumDescriptor(_REQUEST_CONTROLREQUEST_COMMAND) @@ -104,8 +104,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=2231, - serialized_end=2296, + serialized_start=2246, + serialized_end=2311, ) _sym_db.RegisterEnumDescriptor(_RESPONSE_QUERYRESPONSE_RESPONSELEVEL) @@ -130,8 +130,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=2298, - serialized_end=2343, + serialized_start=2313, + serialized_end=2358, ) _sym_db.RegisterEnumDescriptor(_RESPONSE_STATUS) @@ -207,8 +207,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=263, - serialized_end=290, + serialized_start=278, + serialized_end=305, ) _CHUNK = _descriptor.Descriptor( @@ -240,21 +240,28 @@ is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='offset_1d', full_name='gnes.Chunk.offset_1d', index=3, + name='raw', full_name='gnes.Chunk.raw', index=3, + number=7, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='offset_1d', full_name='gnes.Chunk.offset_1d', index=4, number=4, type=13, cpp_type=3, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='offset_nd', full_name='gnes.Chunk.offset_nd', index=4, + name='offset_nd', full_name='gnes.Chunk.offset_nd', index=5, number=5, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='weight', full_name='gnes.Chunk.weight', index=5, + name='weight', full_name='gnes.Chunk.weight', index=6, number=6, type=2, cpp_type=6, label=1, has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, @@ -276,7 +283,7 @@ index=0, containing_type=None, fields=[]), ], serialized_start=113, - serialized_end=301, + serialized_end=316, ) @@ -373,8 +380,8 @@ name='raw_data', full_name='gnes.Document.raw_data', index=0, containing_type=None, fields=[]), ], - serialized_start=304, - serialized_end=669, + serialized_start=319, + serialized_end=684, ) @@ -411,8 +418,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=813, - serialized_end=884, + serialized_start=828, + serialized_end=899, ) _ENVELOPE = _descriptor.Descriptor( @@ -476,8 +483,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=672, - serialized_end=884, + serialized_start=687, + serialized_end=899, ) @@ -524,8 +531,8 @@ name='body', full_name='gnes.Message.body', index=0, containing_type=None, fields=[]), ], - serialized_start=886, - serialized_end=1007, + serialized_start=901, + serialized_end=1022, ) @@ -562,8 +569,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1226, - serialized_end=1285, + serialized_start=1241, + serialized_end=1300, ) _REQUEST_INDEXREQUEST = _descriptor.Descriptor( @@ -592,8 +599,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1287, - serialized_end=1331, + serialized_start=1302, + serialized_end=1346, ) _REQUEST_QUERYREQUEST = _descriptor.Descriptor( @@ -629,8 +636,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1333, - serialized_end=1393, + serialized_start=1348, + serialized_end=1408, ) _REQUEST_CONTROLREQUEST = _descriptor.Descriptor( @@ -660,8 +667,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1395, - serialized_end=1504, + serialized_start=1410, + serialized_end=1519, ) _REQUEST = _descriptor.Descriptor( @@ -721,8 +728,8 @@ name='body', full_name='gnes.Request.body', index=0, containing_type=None, fields=[]), ], - serialized_start=1010, - serialized_end=1512, + serialized_start=1025, + serialized_end=1527, ) @@ -752,8 +759,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1740, - serialized_end=1794, + serialized_start=1755, + serialized_end=1809, ) _RESPONSE_INDEXRESPONSE = _descriptor.Descriptor( @@ -782,8 +789,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1796, - serialized_end=1850, + serialized_start=1811, + serialized_end=1865, ) _RESPONSE_CONTROLRESPONSE = _descriptor.Descriptor( @@ -812,8 +819,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1852, - serialized_end=1908, + serialized_start=1867, + serialized_end=1923, ) _RESPONSE_QUERYRESPONSE_SCOREDRESULT = _descriptor.Descriptor( @@ -866,8 +873,8 @@ name='body', full_name='gnes.Response.QueryResponse.ScoredResult.body', index=0, containing_type=None, fields=[]), ], - serialized_start=2106, - serialized_end=2229, + serialized_start=2121, + serialized_end=2244, ) _RESPONSE_QUERYRESPONSE = _descriptor.Descriptor( @@ -918,8 +925,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1911, - serialized_end=2296, + serialized_start=1926, + serialized_end=2311, ) _RESPONSE = _descriptor.Descriptor( @@ -980,8 +987,8 @@ name='body', full_name='gnes.Response.body', index=0, containing_type=None, fields=[]), ], - serialized_start=1515, - serialized_end=2351, + serialized_start=1530, + serialized_end=2366, ) _CHUNK_COORDINATE.containing_type = _CHUNK @@ -993,6 +1000,9 @@ _CHUNK.oneofs_by_name['content'].fields.append( _CHUNK.fields_by_name['blob']) _CHUNK.fields_by_name['blob'].containing_oneof = _CHUNK.oneofs_by_name['content'] +_CHUNK.oneofs_by_name['content'].fields.append( + _CHUNK.fields_by_name['raw']) +_CHUNK.fields_by_name['raw'].containing_oneof = _CHUNK.oneofs_by_name['content'] _DOCUMENT.fields_by_name['chunks'].message_type = _CHUNK _DOCUMENT.fields_by_name['chunk_embeddings'].message_type = _NDARRAY _DOCUMENT.fields_by_name['doc_type'].enum_type = _DOCUMENT_DOCTYPE @@ -1241,8 +1251,8 @@ file=DESCRIPTOR, index=0, serialized_options=None, - serialized_start=2354, - serialized_end=2581, + serialized_start=2369, + serialized_end=2596, methods=[ _descriptor.MethodDescriptor( name='Train', diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py index e33f72ad..cb3226b6 100644 --- a/gnes/service/indexer.py +++ b/gnes/service/indexer.py @@ -29,27 +29,27 @@ def post_init(self): @handler.register(gnes_pb2.Request.IndexRequest) def _handler_index(self, msg: 'gnes_pb2.Message'): - all_vecs = [] - doc_ids = [] - offsets = [] - weights = [] - - for d in msg.request.index.docs: - if len(d.chunks): - all_vecs.append(blob2array(d.chunk_embeddings)) - doc_ids += [d.doc_id] * len(d.chunks) - if d.doc_type == gnes_pb2.Document.TEXT: - offsets += [c.offset_1d for c in d.chunks] - elif d.doc_type == gnes_pb2.Document.IMAGE: - offsets += [c.offset_nd for c in d.chunks] - elif d.doc_type == gnes_pb2.Document.VIDEO: - offsets += [c.offset_1d for c in d.chunks] - elif d.doc_type == gnes_pb2.Document.AUDIO: - offsets += [c.offset_1d for c in d.chunks] - weights += [c.weight for c in d.chunks] - from ..indexer.base import BaseVectorIndexer, BaseTextIndexer if isinstance(self._model, BaseVectorIndexer): + all_vecs = [] + doc_ids = [] + offsets = [] + weights = [] + + for d in msg.request.index.docs: + if len(d.chunks): + all_vecs.append(blob2array(d.chunk_embeddings)) + doc_ids += [d.doc_id] * len(d.chunks) + if d.doc_type == gnes_pb2.Document.TEXT: + offsets += [c.offset_1d for c in d.chunks] + elif d.doc_type == gnes_pb2.Document.IMAGE: + offsets += [c.offset_nd for c in d.chunks] + elif d.doc_type == gnes_pb2.Document.VIDEO: + offsets += [c.offset_1d for c in d.chunks] + elif d.doc_type == gnes_pb2.Document.AUDIO: + offsets += [c.offset_1d for c in d.chunks] + weights += [c.weight for c in d.chunks] + self._model.add(list(zip(doc_ids, offsets)), np.concatenate(all_vecs, 0), weights) diff --git a/tests/test_dict_indexer.py b/tests/test_dict_indexer.py new file mode 100644 index 00000000..851a4d68 --- /dev/null +++ b/tests/test_dict_indexer.py @@ -0,0 +1,52 @@ +import os +import unittest +from shutil import rmtree + +from gnes.indexer.fulltext.filesys import DirectoryIndexer +from gnes.preprocessor.base import BasePreprocessor +from gnes.proto import gnes_pb2 + + +class TestDictIndexer(unittest.TestCase): + def setUp(self): + self.dirname = os.path.dirname(__file__) + + self.video_path = os.path.join(self.dirname, 'videos') + self.video_bytes = [open(os.path.join(self.video_path, _), 'rb').read() + for _ in os.listdir(self.video_path)] + + self.pipeline_name = 'pipe-gif' + self.pipeline_yml_path = os.path.join(self.dirname, 'yaml/%s.yml' % self.pipeline_name) + self.data_path = './test_chunkleveldb' + self.dump_path = os.path.join(self.dirname, 'indexer.bin') + + def tearDown(self): + if os.path.exists(self.data_path): + rmtree(self.data_path) + + def init_db(self): + self.db = DirectoryIndexer(self.data_path) + + self.d = gnes_pb2.Document() + self.d.doc_id = 0 + self.d.raw_bytes = self.video_bytes[0] + + preprocess = BasePreprocessor.load_yaml(self.pipeline_yml_path) + preprocess.apply(self.d) + + self.db.add(list(range(len(self.video_bytes))), [self.d]) + + def test_add_docs(self): + self.init_db() + self.assertTrue(os.path.exists(os.path.join(self.data_path, str(self.d.doc_id)))) + self.assertEqual(len(self.d.chunks), len(os.listdir(os.path.join(self.data_path, str(self.d.doc_id))))) + + def test_query_docs(self): + self.init_db() + + query_list = [0, 1, 2] + res = self.db.query(query_list) + num_non_empty = sum(1 for d in res if d) + self.assertEqual(num_non_empty, 1) + + diff --git a/tests/test_gif.py b/tests/test_gif.py new file mode 100644 index 00000000..9617be61 --- /dev/null +++ b/tests/test_gif.py @@ -0,0 +1,31 @@ +import os +import unittest + +from gnes.preprocessor.base import BasePreprocessor +from gnes.preprocessor.video.ffmpeg import FFmpegVideoSegmentor +from gnes.proto import gnes_pb2 +import copy + + +class TestPartition(unittest.TestCase): + def setUp(self): + self.dirname = os.path.dirname(__file__) + self.p3_name = 'pipe-gif' + self.pipeline_path = os.path.join(self.dirname, 'yaml/%s.yml' % self.p3_name) + self.ffmpeg_yaml_path = os.path.join(self.dirname, 'yaml/preprocessor-ffmpeg2.yml') + self.video_path = os.path.join(self.dirname, 'videos') + self.video_bytes = [open(os.path.join(self.video_path, _), 'rb').read() + for _ in os.listdir(self.video_path)] + + def test_gif_pipelinepreproces(self): + d = gnes_pb2.Document() + d.raw_bytes = self.video_bytes[0] + d_ = copy.deepcopy(d) + + p3 = FFmpegVideoSegmentor.load_yaml(self.ffmpeg_yaml_path) + p3.apply(d) + + p4 = BasePreprocessor.load_yaml(self.pipeline_path) + p4.apply(d_) + + self.assertEqual(len(d.chunks), len(d_.chunks)) diff --git a/tests/yaml/pipe-gif.yml b/tests/yaml/pipe-gif.yml new file mode 100644 index 00000000..4b1d292c --- /dev/null +++ b/tests/yaml/pipe-gif.yml @@ -0,0 +1,18 @@ +!PipelinePreprocessor +components: + - !FFmpegVideoSegmentor + parameters: + segment_method: cut_by_frame + segment_interval: 30 + s: "192*168" + r: 1 + gnes_config: + name: FFmpegVideoSegmentor + is_trained: true + - !RawChunkPreprocessor + gnes_config: + name: RawChunkPreprocessor + is_trained: true +gnes_config: + name: PipelinePreprocessor + work_dir: /save \ No newline at end of file