Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
Merge branch 'master' into fix-import-component
Browse files Browse the repository at this point in the history
  • Loading branch information
Han Xiao authored Aug 21, 2019
2 parents 5f1ca00 + be0e59a commit 27dc34d
Show file tree
Hide file tree
Showing 11 changed files with 303 additions and 68 deletions.
3 changes: 2 additions & 1 deletion gnes/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
'JointIndexer': 'base',
'BaseIndexer': 'base',
'BaseTextIndexer': 'base',
'AnnoyIndexer': 'vector.annoy'
'AnnoyIndexer': 'vector.annoy',
'DirectoryIndexer': 'fulltext.filesys'
}

register_all_class(_cls2file_map, 'indexer')
75 changes: 75 additions & 0 deletions gnes/indexer/fulltext/filesys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Tencent is pleased to support the open source community by making GNES available.
#
# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.




import os
from typing import List

from ..base import BaseTextIndexer
from ...proto import gnes_pb2


class DirectoryIndexer(BaseTextIndexer):
    """Indexer that persists the raw bytes of every chunk as a file on disk.

    Folder structure: ``<data_path>/<doc_id>/<chunk_index>.<file_suffix>``,
    e.g. ``/data_path/42/0.gif``, ``/data_path/42/1.gif``...
    """

    def __init__(self, data_path: str,
                 keep_na_doc: bool = True,
                 file_suffix: str = 'gif',
                 *args, **kwargs):
        """
        :param data_path: root directory; one sub-directory per doc id is created below it
        :param keep_na_doc: if True, ``query`` returns a placeholder for every
            doc id that has no directory; if False such ids are left out of the result
        :param file_suffix: extension given to the chunk files written by ``add``
        """
        super().__init__(*args, **kwargs)
        self.data_path = data_path
        self.file_suffix = file_suffix
        self.keep_na_doc = keep_na_doc
        # placeholder put into the query result for unknown doc ids
        self._NOT_FOUND = None

    @staticmethod
    def _chunk_order(file_name: str):
        """Sort key restoring the order in which ``add`` numbered the chunk files.

        Files named ``<int>.<suffix>`` sort numerically (so ``10`` follows ``9``);
        anything else sorts after them, by name.
        """
        stem = os.path.splitext(file_name)[0]
        if stem.isdigit():
            return 0, int(stem), file_name
        return 1, 0, file_name

    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """
        write the raw bytes (GIFs by default) of each document's chunks to disk
        folder structure: /data_path/doc_id/0.gif, 1.gif...
        :param keys: list of doc id
        :param docs: list of docs
        """
        for k, d in zip(keys, docs):
            dirs = os.path.join(self.data_path, str(k))
            # exist_ok avoids the check-then-create race of exists() + makedirs()
            os.makedirs(dirs, exist_ok=True)
            for i, chunk in enumerate(d.chunks):
                with open(os.path.join(dirs, '%d.%s' % (i, self.file_suffix)), 'wb') as f:
                    f.write(chunk.raw)

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """
        :param keys: list of doc id
        :return: list of documents whose chunks field contains all the files
            stored for this doc (one file per chunk, in the order they were added);
            unknown doc ids yield a placeholder when ``keep_na_doc`` is set and
            are skipped otherwise
        """
        res = []
        for k in keys:
            target_dirs = os.path.join(self.data_path, str(k))
            if not os.path.exists(target_dirs):
                if self.keep_na_doc:
                    res.append(self._NOT_FOUND)
                continue

            doc = gnes_pb2.Document()
            # os.listdir gives no ordering guarantee, so sort by chunk index
            for raw_file in sorted(os.listdir(target_dirs), key=self._chunk_order):
                raw_path = os.path.join(target_dirs, raw_file)
                # test the full path: a bare name would be resolved against the CWD
                if os.path.isdir(raw_path):
                    continue
                c = doc.chunks.add()
                c.doc_id = k
                with open(raw_path, 'rb') as raw:
                    c.raw = raw.read()
            res.append(doc)
        return res


1 change: 1 addition & 0 deletions gnes/preprocessor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
'ShotDetectPreprocessor': 'video.shotdetect',
'AudioVanilla': 'audio.audio_vanilla',
'BaseAudioPreprocessor': 'base'
'RawChunkPreprocessor': 'base'
}

register_all_class(_cls2file_map, 'preprocessor')
27 changes: 26 additions & 1 deletion gnes/preprocessor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import numpy as np

from ..base import TrainableBase, CompositionalTrainableBase
from ..proto import gnes_pb2, array2blob
from ..proto import gnes_pb2, array2blob, blob2array
from .helper import get_gif


class BasePreprocessor(TrainableBase):
Expand Down Expand Up @@ -99,3 +100,27 @@ def raw_to_chunk(self, chunk: 'gnes_pb2.Chunk', raw_bytes: bytes):
raise NotImplementedError
else:
raise NotImplementedError


class RawChunkPreprocessor(BasePreprocessor):
    """Preprocessor that fills the ``raw`` field of every chunk of a document."""

    @staticmethod
    def _parse_chunk(chunk: 'gnes_pb2.Chunk', doc_type, *args, **kwargs):
        """Convert one chunk into its raw-bytes form according to the doc type.

        Only VIDEO is implemented: the chunk blob is decoded to an array and
        passed to ``get_gif``.

        :param chunk: the chunk whose ``blob`` is converted
        :param doc_type: one of the ``gnes_pb2.Document`` doc type constants
        :return: the result of ``get_gif`` for the chunk
        :raises NotImplementedError: for TEXT, IMAGE and AUDIO documents
        :raises ValueError: for any other doc type
        """
        document = gnes_pb2.Document
        if doc_type == document.VIDEO:
            frames = blob2array(chunk.blob)
            return get_gif(frames)
        if doc_type in (document.TEXT, document.IMAGE, document.AUDIO):
            raise NotImplementedError
        raise ValueError("doc type can only be TEXT, IMAGE, VIDEO or AUDIO!")

    def apply(self, doc: 'gnes_pb2.Document') -> None:
        """Set ``chunk.raw`` for every chunk of ``doc``.

        A document without ``raw_bytes`` is only reported via the logger and
        left untouched.

        :param doc: the document to process in place
        """
        if not doc.raw_bytes:
            self.logger.error('bad document: "raw_bytes" is empty!')
            return

        for each_chunk in doc.chunks:
            each_chunk.raw = self._parse_chunk(each_chunk, doc.doc_type)
19 changes: 19 additions & 0 deletions gnes/preprocessor/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,25 @@ def split_video_frames(buffer_data: bytes,
return [np.array(Image.open(io.BytesIO(chunk))) for chunk in chunks]


def get_gif(images, fps=4):
    """Encode a sequence of frames into an animated GIF with ffmpeg.

    The frames are piped to an ``ffmpeg`` subprocess as ``rawvideo`` in
    ``rgb24`` pixel format; the frame size is taken from the first frame
    (``shape[1]`` x ``shape[0]``), so all frames presumably need to be
    H x W x 3 uint8 arrays of the same size -- TODO confirm with callers.
    The ffmpeg exit status is not checked and its stderr is discarded.

    :param images: sequence of numpy arrays, one per frame
    :param fps: frame rate used for both reading the frames and the output GIF
    :return: the GIF as bytes; ``b''`` if ``images`` is empty
    """
    # nothing to encode; also avoids the IndexError on images[0] below
    if len(images) == 0:
        return b''

    cmd = ['ffmpeg', '-y',
           '-f', 'rawvideo',
           '-vcodec', 'rawvideo',
           '-r', '%.02f' % fps,
           '-s', '%dx%d' % (images[0].shape[1], images[0].shape[0]),
           '-pix_fmt', 'rgb24',
           '-i', '-',
           '-filter_complex', '[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse',
           '-r', '%.02f' % fps,
           '-f', 'gif',
           '-']
    # tobytes() replaces the deprecated (removed in NumPy 2.0) tostring()
    frames = b''.join(image.tobytes() for image in images)
    with sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, bufsize=-1, shell=False) as pipe:
        # communicate() feeds stdin while draining stdout/stderr, so a full
        # OS pipe buffer cannot deadlock us the way manual stdin.write() can
        out, _ = pipe.communicate(input=frames)
    return out


def block_descriptor(image: 'np.ndarray',
descriptor_fn: Callable,
num_blocks: int = 3) -> 'np.ndarray':
Expand Down
3 changes: 3 additions & 0 deletions gnes/proto/gnes.proto
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ message Chunk {

// the original ndarray of the chunk (applies to image/video documents)
NdArray blob = 3;

// raw bytes of chunk;
bytes raw = 7;
}

message Coordinate {
Expand Down
Loading

0 comments on commit 27dc34d

Please sign in to comment.