From 50a944b65c6037dae46ed04925fbb9af6b5523ab Mon Sep 17 00:00:00 2001
From: Jem
Date: Tue, 3 Sep 2019 21:33:44 +0800
Subject: [PATCH] feat(encoder): add yt8m feature extractor

---
 gnes/encoder/__init__.py                      |   3 +-
 gnes/encoder/video/yt8m_feature_extractor.py  | 121 ++++
 .../yt8m_feature_extractor_cores/__init__.py  |   0
 .../inception_utils.py                        |  82 +++
 .../inception_v3.py                           | 561 ++++++++++++++++++
 tests/__init__.py                             |   8 +-
 tests/test_yt8m_feature_extractor.py          |  26 +
 tests/yaml/yt8m_feature.yml                   |   7 +
 8 files changed, 805 insertions(+), 3 deletions(-)
 create mode 100644 gnes/encoder/video/yt8m_feature_extractor.py
 create mode 100644 gnes/encoder/video/yt8m_feature_extractor_cores/__init__.py
 create mode 100644 gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py
 create mode 100644 gnes/encoder/video/yt8m_feature_extractor_cores/inception_v3.py
 create mode 100644 tests/test_yt8m_feature_extractor.py
 create mode 100644 tests/yaml/yt8m_feature.yml

diff --git a/gnes/encoder/__init__.py b/gnes/encoder/__init__.py
index b0aa6e87..43a67000 100644
--- a/gnes/encoder/__init__.py
+++ b/gnes/encoder/__init__.py
@@ -42,7 +42,8 @@
     'MfccEncoder': 'audio.mfcc',
     'PoolingEncoder': 'numeric.pooling',
     'PyTorchTransformers': 'text.transformer',
-    'VggishEncoder': 'audio.vggish'
+    'VggishEncoder': 'audio.vggish',
+    'YouTube8MFeatureExtractor': 'video.yt8m_feature_extractor'
 }
 
 register_all_class(_cls2file_map, 'encoder')
diff --git a/gnes/encoder/video/yt8m_feature_extractor.py b/gnes/encoder/video/yt8m_feature_extractor.py
new file mode 100644
index 00000000..62fa2453
--- /dev/null
+++ b/gnes/encoder/video/yt8m_feature_extractor.py
@@ -0,0 +1,121 @@
+# Tencent is pleased to support the open source community by making GNES available.
+#
+# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+import numpy as np
+from PIL import Image
+from ..base import BaseVideoEncoder
+from ...helper import batching, get_first_available_gpu
+
+
+class YouTube8MFeatureExtractor(BaseVideoEncoder):
+    """Extracts YouTube8M features for RGB frames.
+
+    This encoder does not download any models by itself. Point `model_dir` at
+    the directory holding the pre-trained Inception checkpoint (85 MB) and
+    `pca_dir` at the directory holding the YouTube8M PCA matrices (15 MB). If
+    both directories exist and contain the necessary files, they are reused
+    as-is.
+
+    `encode` returns one 1024-dimensional float32 feature vector per frame.
+
+    Usage example:
+
+        import numpy as np
+        from PIL import Image
+
+        extractor = YouTube8MFeatureExtractor(model_dir, pca_dir)
+
+        im = np.array(Image.open(image_file))
+        features = extractor.encode([im])
+
+    Note: OpenCV reverses the order of channels (i.e. orders channels as BGR
+    instead of RGB).
+    If you are using OpenCV, you must reverse the channel order before
+    encoding:
+
+        im = im[:, :, ::-1]  # Reverses order on last (i.e. channel) dimension.
+
+    and then call `extractor.encode([im])`.
+    """
+    batch_size = 64
+
+    def __init__(self, model_dir: str,
+                 pca_dir: str,
+                 select_layer: str = 'PreLogits',
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.model_dir = model_dir
+        self.pca_dir = pca_dir
+        self.select_layer = select_layer
+        self.inception_size_x = 299
+        self.inception_size_y = 299
+
+    def post_init(self):
+        import tensorflow as tf
+        from .yt8m_feature_extractor_cores.inception_v3 import inception_v3
+        from .yt8m_feature_extractor_cores.inception_utils import inception_arg_scope
+        import os
+
+        # expose only the first free GPU to TensorFlow
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(get_first_available_gpu())
+
+        # YouTube8M PCA: the 2048-d mean, the top-1024 eigenvalues and the
+        # 2048 x 1024 projection matrix
+        self.pca_mean = np.load(os.path.join(self.pca_dir, 'mean.npy'))[:, 0]
+        self.pca_eigenvals = np.load(os.path.join(self.pca_dir, 'eigenvals.npy'))[:1024, 0]
+        self.pca_eigenvecs = np.load(os.path.join(self.pca_dir, 'eigenvecs.npy')).T[:, :1024]
+
+        g = tf.Graph()
+        with g.as_default():
+            arg_scope = inception_arg_scope()
+            inception_v3.default_image_size = self.inception_size_x
+            self.inputs = tf.placeholder(tf.float32, (None,
+                                                      self.inception_size_x,
+                                                      self.inception_size_y, 3))
+
+            with tf.contrib.slim.arg_scope(arg_scope):
+                self.logits, self.end_points = inception_v3(self.inputs,
+                                                            num_classes=1001,
+                                                            is_training=False,
+                                                            dropout_keep_prob=1.0)
+
+            config = tf.ConfigProto(log_device_placement=False)
+            if self.on_gpu:
+                config.gpu_options.allow_growth = True
+            self.sess = tf.Session(config=config)
+            self.saver = tf.train.Saver()
+            # `model_dir` is handed to Saver.restore as-is, so it must be the
+            # checkpoint prefix of the pre-trained Inception model
+            self.saver.restore(self.sess, self.model_dir)
+
+    def encode(self, img: List['np.ndarray'], *args, **kwargs) -> np.ndarray:
+        # resize every frame to 299 x 299 and rescale RGB values from
+        # [0, 255] to the [-1, 1] range that Inception expects
+        img = [(np.array(Image.fromarray(im).resize((self.inception_size_x,
+                                                     self.inception_size_y)), dtype=np.float32) * 2 / 255. - 1.) for im
+               in img]
+
+        # `batching` runs the network on at most `batch_size` frames at a time
+        @batching
+        def _encode(_, data):
+            def _pca(data):
+                # (batch, 1, 1, 2048) -> (batch, 2048)
+                data = np.squeeze(data, axis=(1, 2))
+                # center, project onto the top-1024 eigenvectors, then whiten
+                data = (data - self.pca_mean).reshape((len(data), 2048))
+                data = np.matmul(data, self.pca_eigenvecs)
+                data = data / np.sqrt(self.pca_eigenvals + 1e-4)
+                return data
+
+            _, end_points_ = self.sess.run((self.logits, self.end_points),
+                                           feed_dict={self.inputs: data})
+
+            return _pca(end_points_[self.select_layer])
+
+        return _encode(self, img).astype(np.float32)
+
diff --git a/gnes/encoder/video/yt8m_feature_extractor_cores/__init__.py b/gnes/encoder/video/yt8m_feature_extractor_cores/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py b/gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py
new file mode 100644
index 00000000..43198a6d
--- /dev/null
+++ b/gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py
@@ -0,0 +1,82 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common code shared by all inception models.
+
+Usage of arg scope:
+  with slim.arg_scope(inception_arg_scope()):
+    logits, end_points = inception.inception_v3(images, num_classes,
+                                                is_training=is_training)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+slim = tf.contrib.slim
+
+
+def inception_arg_scope(weight_decay=0.00004,
+                        use_batch_norm=True,
+                        batch_norm_decay=0.9997,
+                        batch_norm_epsilon=0.001,
+                        activation_fn=tf.nn.relu,
+                        batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS,
+                        batch_norm_scale=False):
+  """Defines the default arg scope for inception models.
+
+  Args:
+    weight_decay: The weight decay to use for regularizing the model.
+    use_batch_norm: If `True`, batch_norm is applied after each convolution.
+    batch_norm_decay: Decay for batch norm moving average.
+    batch_norm_epsilon: Small float added to variance to avoid dividing by zero
+      in batch norm.
+    activation_fn: Activation function for conv2d.
+    batch_norm_updates_collections: Collection for the update ops for
+      batch norm.
+    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
+      activations in the batch normalization layer.
+
+  Returns:
+    An `arg_scope` to use for the inception models.
+  """
+  batch_norm_params = {
+      # Decay for the moving averages.
+      'decay': batch_norm_decay,
+      # epsilon to prevent 0s in variance.
+      'epsilon': batch_norm_epsilon,
+      # collection containing update_ops.
+      'updates_collections': batch_norm_updates_collections,
+      # use fused batch norm if possible.
+      'fused': None,
+      'scale': batch_norm_scale,
+  }
+  if use_batch_norm:
+    normalizer_fn = slim.batch_norm
+    normalizer_params = batch_norm_params
+  else:
+    normalizer_fn = None
+    normalizer_params = {}
+  # Set weight_decay for weights in Conv and FC layers.
+  with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                      weights_regularizer=slim.l2_regularizer(weight_decay)):
+    with slim.arg_scope(
+        [slim.conv2d],
+        weights_initializer=slim.variance_scaling_initializer(),
+        activation_fn=activation_fn,
+        normalizer_fn=normalizer_fn,
+        normalizer_params=normalizer_params) as sc:
+      return sc
diff --git a/gnes/encoder/video/yt8m_feature_extractor_cores/inception_v3.py b/gnes/encoder/video/yt8m_feature_extractor_cores/inception_v3.py
new file mode 100644
index 00000000..57d7964a
--- /dev/null
+++ b/gnes/encoder/video/yt8m_feature_extractor_cores/inception_v3.py
@@ -0,0 +1,561 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition for inception v3 classification network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from . import inception_utils
+
+slim = tf.contrib.slim
+trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)
+
+
+def inception_v3_base(inputs,
+                      final_endpoint='Mixed_7c',
+                      min_depth=16,
+                      depth_multiplier=1.0,
+                      scope=None):
+  """Inception model from http://arxiv.org/abs/1512.00567.
+
+  Constructs an Inception v3 network from inputs to the given final endpoint.
+  This method can construct the network up to the final inception block
+  Mixed_7c.
+
+  Note that the names of the layers in the paper do not correspond to the names
+  of the endpoints registered by this function although they build the same
+  network.
+
+  Here is a mapping from the old_names to the new names:
+  Old name          | New name
+  =======================================
+  conv0             | Conv2d_1a_3x3
+  conv1             | Conv2d_2a_3x3
+  conv2             | Conv2d_2b_3x3
+  pool1             | MaxPool_3a_3x3
+  conv3             | Conv2d_3b_1x1
+  conv4             | Conv2d_4a_3x3
+  pool2             | MaxPool_5a_3x3
+  mixed_35x35x256a  | Mixed_5b
+  mixed_35x35x288a  | Mixed_5c
+  mixed_35x35x288b  | Mixed_5d
+  mixed_17x17x768a  | Mixed_6a
+  mixed_17x17x768b  | Mixed_6b
+  mixed_17x17x768c  | Mixed_6c
+  mixed_17x17x768d  | Mixed_6d
+  mixed_17x17x768e  | Mixed_6e
+  mixed_8x8x1280a   | Mixed_7a
+  mixed_8x8x2048a   | Mixed_7b
+  mixed_8x8x2048b   | Mixed_7c
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
+      'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c',
+      'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'].
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Enforced when depth_multiplier < 1, and not an active constraint when
+      depth_multiplier >= 1.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    scope: Optional variable_scope.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a set of activations for external use, for example summaries or
+      losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+      or depth_multiplier <= 0
+  """
+  # end_points will collect relevant activations for external use, for example
+  # summaries or losses.
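+  # `depth` below scales each filter bank by `depth_multiplier`, but never
+  # lets a layer drop under `min_depth` channels.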
+ end_points = {} + + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs]): + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='VALID'): + # 299 x 299 x 3 + end_point = 'Conv2d_1a_3x3' + net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 149 x 149 x 32 + end_point = 'Conv2d_2a_3x3' + net = slim.conv2d(net, depth(32), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 32 + end_point = 'Conv2d_2b_3x3' + net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 64 + end_point = 'MaxPool_3a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 64 + end_point = 'Conv2d_3b_1x1' + net = slim.conv2d(net, depth(80), [1, 1], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 80. + end_point = 'Conv2d_4a_3x3' + net = slim.conv2d(net, depth(192), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 71 x 71 x 192. + end_point = 'MaxPool_5a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 35 x 35 x 192. + + # Inception blocks + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + # mixed: 35 x 35 x 256. + end_point = 'Mixed_5b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(32), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_1: 35 x 35 x 288. 
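+      # NB: two scope names in the next block ('Conv2d_0b_1x1' and
+      # 'Conv_1_0c_5x5') deviate from the usual naming pattern; they are kept
+      # verbatim, apparently so that variable names still match the released
+      # checkpoint (renaming them would break Saver.restore).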
+ end_point = 'Mixed_5c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv_1_0c_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_2: 35 x 35 x 288. + end_point = 'Mixed_5d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_3: 17 x 17 x 768. + end_point = 'Mixed_6a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed4: 17 x 17 x 768. 
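+      # Mixed_6b through Mixed_6e replace 7x7 convolutions with stacked 1x7
+      # and 7x1 convolutions, the asymmetric factorization described in the
+      # Inception v3 paper (arXiv:1512.00567).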
+ end_point = 'Mixed_6b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(128), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(128), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_5: 17 x 17 x 768. + end_point = 'Mixed_6c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_6: 17 x 17 x 768. 
+ end_point = 'Mixed_6d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_7: 17 x 17 x 768. + end_point = 'Mixed_6e' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_8: 8 x 8 x 1280. + end_point = 'Mixed_7a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_9: 8 x 8 x 2048. 
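+      # Mixed_7b/7c widen each tower by concatenating parallel 1x3 and 3x1
+      # convolutions instead of stacking them (the paper's "expanded filter
+      # bank" for the coarse 8x8 grid).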
+      end_point = 'Mixed_7b'
+      with tf.variable_scope(end_point):
+        with tf.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
+              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')])
+        with tf.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(
+              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
+          branch_2 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
+              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')])
+        with tf.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+
+      # mixed_10: 8 x 8 x 2048.
+      end_point = 'Mixed_7c'
+      with tf.variable_scope(end_point):
+        with tf.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
+              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')])
+        with tf.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(
+              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
+          branch_2 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
+              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')])
+        with tf.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+    raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def inception_v3(inputs,
+                 num_classes=1000,
+                 is_training=True,
+                 dropout_keep_prob=0.8,
+                 min_depth=16,
+                 depth_multiplier=1.0,
+                 prediction_fn=slim.softmax,
+                 spatial_squeeze=True,
+                 reuse=None,
+                 create_aux_logits=True,
+                 scope='InceptionV3',
+                 global_pool=False):
+  """Inception model from http://arxiv.org/abs/1512.00567.
+
+  "Rethinking the Inception Architecture for Computer Vision"
+  Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
+  Zbigniew Wojna.
+
+  With the default arguments this method constructs the exact model defined in
+  the paper. However, one can experiment with variations of the inception_v3
+  network by changing arguments dropout_keep_prob, min_depth and
+  depth_multiplier.
+
+  The default image size used to train this network is 299x299.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    is_training: whether the network is in training mode.
+ dropout_keep_prob: the percentage of activation values that are retained. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + prediction_fn: a function to get predictions out of logits. + spatial_squeeze: if True, logits is of shape [B, C], if false logits is of + shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + create_aux_logits: Whether to create the auxiliary logits. + scope: Optional variable_scope. + global_pool: Optional boolean flag to control the avgpooling before the + logits layer. If false or unset, pooling is done with a fixed window + that reduces default-sized inputs to 1x1, while larger inputs lead to + larger outputs. If true, any input size is pooled down to 1x1. + Returns: + net: a Tensor with the logits (pre-softmax activations) if num_classes + is a non-zero integer, or the non-dropped-out input to the logits layer + if num_classes is 0 or None. + end_points: a dictionary from components of the network to the corresponding + activation. + Raises: + ValueError: if 'depth_multiplier' is less than or equal to zero. + """ + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs], reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + net, end_points = inception_v3_base( + inputs, scope=scope, min_depth=min_depth, + depth_multiplier=depth_multiplier) + + # Auxiliary Head logits + if create_aux_logits and num_classes: + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + aux_logits = end_points['Mixed_6e'] + with tf.variable_scope('AuxLogits'): + aux_logits = slim.avg_pool2d( + aux_logits, [5, 5], stride=3, padding='VALID', + scope='AvgPool_1a_5x5') + aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1], + scope='Conv2d_1b_1x1') + + # Shape of feature map before the final layer. + kernel_size = _reduced_kernel_size_for_small_input( + aux_logits, [5, 5]) + aux_logits = slim.conv2d( + aux_logits, depth(768), kernel_size, + weights_initializer=trunc_normal(0.01), + padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size)) + aux_logits = slim.conv2d( + aux_logits, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, weights_initializer=trunc_normal(0.001), + scope='Conv2d_2b_1x1') + if spatial_squeeze: + aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze') + end_points['AuxLogits'] = aux_logits + + # Final pooling and prediction + with tf.variable_scope('Logits'): + if global_pool: + # Global average pooling. + net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='GlobalPool') + end_points['global_pool'] = net + else: + # Pooling with a fixed kernel size. 
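+          # with the default 299x299 input the feature map here is 8x8x2048;
+          # the helper below shrinks the pooling window for smaller inputs so
+          # it never exceeds the actual feature-map size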
+          kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8])
+          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+                                scope='AvgPool_1a_{}x{}'.format(*kernel_size))
+          end_points['AvgPool_1a'] = net
+        if not num_classes:
+          return net, end_points
+        # 1 x 1 x 2048
+        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
+        end_points['PreLogits'] = net
+        # 2048
+        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
+                             normalizer_fn=None, scope='Conv2d_1c_1x1')
+        if spatial_squeeze:
+          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+        # 1000
+      end_points['Logits'] = logits
+      end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+inception_v3.default_image_size = 299
+
+
+def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
+  """Define kernel size which is automatically reduced for small input.
+
+  If the shape of the input images is unknown at graph construction time this
+  function assumes that the input images are large enough.
+
+  Args:
+    input_tensor: input tensor of size [batch_size, height, width, channels].
+    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]
+
+  Returns:
+    a tensor with the kernel size.
+
+  TODO(jrru): Make this function work with unknown shapes. Theoretically, this
+  can be done with the code below. Problems are two-fold: (1) If the shape was
+  known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot
+  handle tensors that define the kernel size.
+      shape = tf.shape(input_tensor)
+      return tf.stack([tf.minimum(shape[1], kernel_size[0]),
+                       tf.minimum(shape[2], kernel_size[1])])
+  """
+  shape = input_tensor.get_shape().as_list()
+  if shape[1] is None or shape[2] is None:
+    kernel_size_out = kernel_size
+  else:
+    kernel_size_out = [min(shape[1], kernel_size[0]),
+                       min(shape[2], kernel_size[1])]
+  return kernel_size_out
+
+
+inception_v3_arg_scope = inception_utils.inception_arg_scope
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
index 2f988957..da47a71d 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -52,7 +52,9 @@ def line2pb_doc(line: str, doc_id: int = 0, deliminator: str = r'[.。!?!?]+
         'FASTERRCNN_MODEL': '/',
         'GNES_PROFILING': '',
         'TORCH_TRANSFORMERS_MODEL': '/torch_transformer',
-        'VGGISH_MODEL': '/lab/vggish'
+        'VGGISH_MODEL': '/lab/vggish',
+        'YT8M_MODEL': '/lab/yt8m',
+        'YT8MPCA_MODEL': '/lab/yt8m_pca'
     },
     'idc-165': {
         'BERT_CI_PORT': 7125,
@@ -71,7 +73,9 @@ def line2pb_doc(line: str, doc_id: int = 0, deliminator: str = r'[.。!?!?]+
         'FASTERRCNN_MODEL': '/ext_data/image_preprocessor',
         'GNES_PROFILING': '',
         'TORCH_TRANSFORMERS_MODEL': '/ext_data/torch_transformer',
-        'VGGISH_MODEL': '/ext_data/lab/vggish'
+        'VGGISH_MODEL': '/ext_data/lab/vggish',
+        'YT8M_MODEL': '/ext_data/lab/yt8m',
+        'YT8MPCA_MODEL': '/ext_data/lab/yt8m_pca'
     }
 }
diff --git a/tests/test_yt8m_feature_extractor.py b/tests/test_yt8m_feature_extractor.py
new file mode 100644
index 00000000..e5326035
--- /dev/null
+++ b/tests/test_yt8m_feature_extractor.py
@@ -0,0 +1,26 @@
+import os
+import unittest
+import numpy as np
+
+from gnes.encoder.base import BaseEncoder
+
+
+class TestYT8MFeatureExtractor(unittest.TestCase):
+
+    def setUp(self):
+        dirname = os.path.dirname(__file__)
+        # random uint8 RGB frames at the Inception input size (randint, not
+        # rand, so that pixel values actually span 0..255)
+        self.test_img = [np.random.randint(0, 255, (299, 299, 3), dtype=np.uint8),
+                         np.random.randint(0, 255, (299, 299, 3), dtype=np.uint8)]
+        self.inception_yaml = os.path.join(dirname, 'yaml', 'yt8m_feature.yml')
+
+    @unittest.skip('requires the pre-trained YT8M model and PCA files')
+    def test_inception_encoding(self):
+        self.encoder = BaseEncoder.load_yaml(self.inception_yaml)
+        vec = self.encoder.encode(self.test_img)
+
+        # one vector per input frame ...
+        self.assertEqual(vec.shape[0], len(self.test_img))
+        # ... reduced to 1024 dimensions by the YT8M PCA
+        self.assertEqual(vec.shape[1], 1024)
diff --git a/tests/yaml/yt8m_feature.yml b/tests/yaml/yt8m_feature.yml
new file mode 100644
index 00000000..02b15759
--- /dev/null
+++ b/tests/yaml/yt8m_feature.yml
@@ -0,0 +1,7 @@
+!YouTube8MFeatureExtractor
+parameters:
+  model_dir: ${YT8M_MODEL}
+  pca_dir: ${YT8MPCA_MODEL}
+gnes_config:
+  name: yt8m
+  is_trained: true
\ No newline at end of file
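
A minimal usage sketch, assuming `YT8M_MODEL` and `YT8MPCA_MODEL` point at the
pre-trained Inception checkpoint and the YouTube8M PCA matrices (as wired up in
tests/__init__.py), driving the new encoder through the YAML config above:

    import numpy as np
    from gnes.encoder.base import BaseEncoder

    # load the extractor defined in tests/yaml/yt8m_feature.yml
    encoder = BaseEncoder.load_yaml('tests/yaml/yt8m_feature.yml')

    # two dummy RGB frames at the Inception input size
    frames = [np.random.randint(0, 255, (299, 299, 3), dtype=np.uint8)
              for _ in range(2)]

    vec = encoder.encode(frames)  # float32 array of shape (2, 1024)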