Merge branch 'master' into fix-import-component

gnes-ai · Aug 21, 2019 · 27dc34d · 27dc34d
2 parents 5f1ca00 + be0e59a
commit 27dc34d
Show file tree

Hide file tree

Showing 11 changed files with 303 additions and 68 deletions.
diff --git a/gnes/indexer/__init__.py b/gnes/indexer/__init__.py
@@ -27,7 +27,8 @@
     'JointIndexer': 'base',
     'BaseIndexer': 'base',
     'BaseTextIndexer': 'base',
-    'AnnoyIndexer': 'vector.annoy'
+    'AnnoyIndexer': 'vector.annoy',
+    'DirectoryIndexer': 'fulltext.filesys'
 }
 
 register_all_class(_cls2file_map, 'indexer')
diff --git a/gnes/indexer/fulltext/filesys.py b/gnes/indexer/fulltext/filesys.py
@@ -0,0 +1,75 @@
+#  Tencent is pleased to support the open source community by making GNES available.
+#
+#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+
+
+import os
+from typing import List
+
+from ..base import BaseTextIndexer
+from ...proto import gnes_pb2
+
+
+class DirectoryIndexer(BaseTextIndexer):
+    """Indexer that keeps the raw bytes of every chunk as a file under ``data_path/<doc_id>/``."""
+
+    def __init__(self, data_path: str, keep_na_doc: bool = True, file_suffix: str = 'gif', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.data_path = data_path
+        self.file_suffix = file_suffix
+        self.keep_na_doc = keep_na_doc
+        self._NOT_FOUND = None
+
+    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
+        """
+        write the raw bytes (GIFs by default) of each document's chunks into disk
+        folder structure: /data_path/doc_id/0.gif, 1.gif...
+        :param keys: list of doc id
+        :param docs: list of docs
+        """
+        for k, d in zip(keys, docs):
+            dirs = os.path.join(self.data_path, str(k))
+            os.makedirs(dirs, exist_ok=True)
+            for i, chunk in enumerate(d.chunks):
+                with open(os.path.join(dirs, '%d.%s' % (i, self.file_suffix)), 'wb') as f:
+                    f.write(chunk.raw)
+
+    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
+        """
+        :param keys: list of doc id
+        :return: list of documents whose chunks field contain all the files of this doc (one file per chunk);
+                 a doc without a folder yields ``None`` when ``keep_na_doc`` is set and is skipped otherwise
+        """
+        res = []
+        for k in keys:
+            target_dirs = os.path.join(self.data_path, str(k))
+            if not os.path.exists(target_dirs):
+                if self.keep_na_doc:
+                    res.append(self._NOT_FOUND)
+                continue
+            doc = gnes_pb2.Document()
+            for raw_file in os.listdir(target_dirs):
+                raw_path = os.path.join(target_dirs, raw_file)
+                # test the joined path: a bare file name would be resolved against the current working directory
+                if not os.path.isdir(raw_path):
+                    c = doc.chunks.add()
+                    c.doc_id = k
+                    with open(raw_path, 'rb') as raw:
+                        c.raw = raw.read()
+            res.append(doc)
+        return res
+
+
diff --git a/gnes/preprocessor/__init__.py b/gnes/preprocessor/__init__.py
@@ -34,6 +34,7 @@
     'ShotDetectPreprocessor': 'video.shotdetect',
     'AudioVanilla': 'audio.audio_vanilla',
     'BaseAudioPreprocessor': 'base'
+    'RawChunkPreprocessor': 'base'
 }
 
 register_all_class(_cls2file_map, 'preprocessor')
diff --git a/gnes/preprocessor/base.py b/gnes/preprocessor/base.py
@@ -19,7 +19,8 @@
 import numpy as np
 
 from ..base import TrainableBase, CompositionalTrainableBase
-from ..proto import gnes_pb2, array2blob
+from ..proto import gnes_pb2, array2blob, blob2array
+from .helper import get_gif
 
 
 class BasePreprocessor(TrainableBase):
@@ -99,3 +100,27 @@ def raw_to_chunk(self, chunk: 'gnes_pb2.Chunk', raw_bytes: bytes):
             raise NotImplementedError
         else:
             raise NotImplementedError
+
+
+class RawChunkPreprocessor(BasePreprocessor):
+    """Preprocessor that fills the ``raw`` field of every chunk from the chunk's blob."""
+
+    @staticmethod
+    def _parse_chunk(chunk: 'gnes_pb2.Chunk', doc_type, *args, **kwargs):
+        """Return the encoded bytes for ``chunk``; only VIDEO is implemented (blob -> ``get_gif``)."""
+        doc_cls = gnes_pb2.Document
+        if doc_type == doc_cls.VIDEO:
+            # the chunk blob is converted back to an array and handed to get_gif
+            return get_gif(blob2array(chunk.blob))
+        if doc_type in (doc_cls.TEXT, doc_cls.IMAGE, doc_cls.AUDIO):
+            # recognised document types that have no raw encoding yet
+            raise NotImplementedError
+        raise ValueError("doc type can only be TEXT, IMAGE, VIDEO or AUDIO!")
+
+    def apply(self, doc: 'gnes_pb2.Document') -> None:
+        """Set ``raw`` on each chunk of ``doc`` in place; log an error when ``doc.raw_bytes`` is empty."""
+        if not doc.raw_bytes:
+            self.logger.error('bad document: "raw_bytes" is empty!')
+            return
+        for chunk in doc.chunks:
+            chunk.raw = self._parse_chunk(chunk, doc.doc_type)
diff --git a/gnes/preprocessor/helper.py b/gnes/preprocessor/helper.py
@@ -194,6 +194,25 @@ def split_video_frames(buffer_data: bytes,
     return [np.array(Image.open(io.BytesIO(chunk))) for chunk in chunks]
 
 
+def get_gif(images, fps=4):
+    """Encode frames (assumed equally sized H x W x 3 uint8 RGB arrays) into GIF bytes by piping them to ffmpeg."""
+    cmd = ['ffmpeg', '-y',
+           '-f', 'rawvideo',
+           '-vcodec', 'rawvideo',
+           '-r', '%.02f' % fps,
+           '-s', '%dx%d' % (images[0].shape[1], images[0].shape[0]),
+           '-pix_fmt', 'rgb24',
+           '-i', '-',
+           '-filter_complex', '[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse',
+           '-r', '%.02f' % fps,
+           '-f', 'gif',
+           '-']
+    with sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, bufsize=-1, shell=False) as pipe:
+        # communicate() drains stdout/stderr while feeding stdin, so a full pipe buffer cannot block the write
+        out, _ = pipe.communicate(input=b''.join(image.tobytes() for image in images))
+    return out
+
+
 def block_descriptor(image: 'np.ndarray',
                      descriptor_fn: Callable,
                      num_blocks: int = 3) -> 'np.ndarray':

diff --git a/gnes/proto/gnes.proto b/gnes/proto/gnes.proto
@@ -25,6 +25,9 @@ message Chunk {
 
         // the original ndarry of the chunk (apply to image/video documents)
         NdArray blob = 3;
+
+        // raw bytes of the chunk
+        bytes raw = 7;
     }
 
     message Coordinate {