Merge pull request #201 from gnes-ai/yt8m_feature_extractor_

feat(encoder): add yt8m feature extractor
gnes-ai · Sep 4, 2019 · 909a44b · 909a44b
2 parents 93a43f5 + 50a944b
commit 909a44b
Show file tree

Hide file tree

Showing 8 changed files with 805 additions and 3 deletions.
diff --git a/gnes/encoder/__init__.py b/gnes/encoder/__init__.py
@@ -42,7 +42,8 @@
     'MfccEncoder': 'audio.mfcc',
     'PoolingEncoder': 'numeric.pooling',
     'PyTorchTransformers': 'text.transformer',
-    'VggishEncoder': 'audio.vggish'
+    'VggishEncoder': 'audio.vggish',
+    'YouTube8MFeatureExtractor': 'video.yt8m_feature_extractor'
 }
 
 register_all_class(_cls2file_map, 'encoder')
diff --git a/gnes/encoder/video/yt8m_feature_extractor.py b/gnes/encoder/video/yt8m_feature_extractor.py
@@ -0,0 +1,121 @@
+#  Tencent is pleased to support the open source community by making GNES available.
+#
+#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from typing import List
+import numpy as np
+from PIL import Image
+from ..base import BaseVideoEncoder
+from ...helper import batching, get_first_available_gpu
+
+
+class YouTube8MFeatureExtractor(BaseVideoEncoder):
+    """Encodes RGB frames with Inception-v3 followed by a PCA projection.
+
+    The constructor only stores its arguments. `post_init` then loads the PCA
+    arrays from `pca_dir`, builds the Inception-v3 graph and restores the
+    weights from `model_dir`. No code in this class downloads anything, so the
+    model and PCA files must already exist.
+
+    Args:
+        model_dir: path handed to `tf.train.Saver.restore` in `post_init`.
+        pca_dir: directory with `mean.npy`, `eigenvals.npy`, `eigenvecs.npy`.
+        select_layer: key into the Inception end points (default 'PreLogits').
+
+    ** Note: OpenCV reverses the order of channels (i.e. orders channels as BGR
+    instead of RGB). If you are using OpenCV, then you must do:
+
+        im = im[:, :, ::-1]  # Reverses order on last (i.e. channel) dimension.
+
+    before passing `im` to `encode`.
+    """
+    batch_size = 64  # presumably the batch size consumed by @batching -- TODO confirm
+
+    def __init__(self, model_dir: str,
+                 pca_dir: str,
+                 select_layer: str = 'PreLogits',
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.model_dir = model_dir
+        self.pca_dir = pca_dir
+        self.select_layer = select_layer
+        self.inception_size_x = 299
+        self.inception_size_y = 299
+
+    def post_init(self):
+        """Loads the PCA arrays, builds the Inception-v3 graph and restores its weights."""
+        import tensorflow as tf
+        from .yt8m_feature_extractor_cores.inception_v3 import inception_v3
+        from .yt8m_feature_extractor_cores.inception_utils import inception_arg_scope
+        import os
+        # Make only the device index returned by get_first_available_gpu() visible to CUDA.
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(get_first_available_gpu())
+
+        # Keep column 0 of the mean and only the first 1024 eigenvalues / projection columns.
+        self.pca_mean = np.load(os.path.join(self.pca_dir, 'mean.npy'))[:, 0]
+        self.pca_eigenvals = np.load(os.path.join(self.pca_dir, 'eigenvals.npy'))[:1024, 0]
+        self.pca_eigenvecs = np.load(os.path.join(self.pca_dir, 'eigenvecs.npy')).T[:, :1024]
+
+        # Build the network in its own graph; the session and saver are created inside it.
+        g = tf.Graph()
+        with g.as_default():
+            arg_scope = inception_arg_scope()
+            inception_v3.default_image_size = self.inception_size_x
+            self.inputs = tf.placeholder(tf.float32, (None,
+                                                      self.inception_size_x,
+                                                      self.inception_size_y, 3))
+
+            with tf.contrib.slim.arg_scope(arg_scope):
+                self.logits, self.end_points = inception_v3(self.inputs,
+                                                            num_classes=1001,
+                                                            is_training=False,
+                                                            dropout_keep_prob=1.0)
+
+            config = tf.ConfigProto(log_device_placement=False)
+            if self.on_gpu:
+                config.gpu_options.allow_growth = True
+            self.sess = tf.Session(config=config)
+            self.saver = tf.train.Saver()
+            self.saver.restore(self.sess, self.model_dir)
+
+    def encode(self, img: List['np.ndarray'], *args, **kwargs) -> np.ndarray:
+        """Resizes and rescales the frames, then returns their PCA-whitened features.
+
+        `img` holds arrays accepted by `PIL.Image.fromarray`. The result is cast
+        to float32; its layout depends on how `batching` merges the batches
+        (presumably one 1024-d row per frame -- TODO confirm).
+        """
+        # Resize to the network input size and map pixel value v to v * 2 / 255 - 1.
+        img = [(np.array(Image.fromarray(im).resize((self.inception_size_x,
+                                                     self.inception_size_y)), dtype=np.float32) * 2 / 255. - 1.) for im
+               in img]
+
+        @batching
+        def _encode(_, data):  # `_` receives `self` from the call at the bottom
+            def _pca(data):
+                # Drop the size-1 axes 1 and 2, centre, project, then scale by 1/sqrt(eigenvalue + 1e-4).
+                data = np.squeeze(data, axis=(1, 2))
+                data = (data - self.pca_mean).reshape((len(data), 2048))
+                data = np.matmul(data, self.pca_eigenvecs)
+                data = data / np.sqrt(self.pca_eigenvals + 1e-4)
+                return data
+
+            _, end_points_ = self.sess.run((self.logits, self.end_points),
+                                           feed_dict={self.inputs: data})
+
+            return _pca(end_points_[self.select_layer])
+
+        return _encode(self, img).astype(np.float32)
+
diff --git a/gnes/encoder/video/yt8m_feature_extractor_cores/__init__.py b/gnes/encoder/video/yt8m_feature_extractor_cores/__init__.py
diff --git a/gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py b/gnes/encoder/video/yt8m_feature_extractor_cores/inception_utils.py
@@ -0,0 +1,82 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common code shared by all inception models.
+
+Usage of arg scope:
+  with slim.arg_scope(inception_arg_scope()):
+    logits, end_points = inception.inception_v3(images, num_classes,
+                                                is_training=is_training)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+slim = tf.contrib.slim  # short alias used by inception_arg_scope below
+
+
+def inception_arg_scope(weight_decay=0.00004,
+                        use_batch_norm=True,
+                        batch_norm_decay=0.9997,
+                        batch_norm_epsilon=0.001,
+                        activation_fn=tf.nn.relu,
+                        batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS,
+                        batch_norm_scale=False):
+    """Defines the default arg scope for inception models.
+
+    Args:
+      weight_decay: The weight decay to use for regularizing the model.
+      use_batch_norm: If `True`, batch_norm is applied after each convolution.
+      batch_norm_decay: Decay for batch norm moving average.
+      batch_norm_epsilon: Small float added to variance to avoid dividing by zero
+        in batch norm.
+      activation_fn: Activation function for conv2d.
+      batch_norm_updates_collections: Collection for the update ops for
+        batch norm.
+      batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
+        activations in the batch normalization layer.
+
+    Returns:
+      An `arg_scope` to use for the inception models.
+    """
+    batch_norm_params = {
+        # Decay for the moving averages.
+        'decay': batch_norm_decay,
+        # Epsilon to prevent 0s in variance.
+        'epsilon': batch_norm_epsilon,
+        # Collection containing update_ops.
+        'updates_collections': batch_norm_updates_collections,
+        # Use fused batch norm if possible.
+        'fused': None,
+        'scale': batch_norm_scale,
+    }
+    if use_batch_norm:
+        normalizer_fn = slim.batch_norm
+        normalizer_params = batch_norm_params
+    else:
+        normalizer_fn = None  # conv2d gets no normalizer; batch_norm_params is unused
+        normalizer_params = {}
+    # Set weight_decay for weights in Conv and FC layers.
+    with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                        weights_regularizer=slim.l2_regularizer(weight_decay)):
+        with slim.arg_scope(
+                [slim.conv2d],
+                weights_initializer=slim.variance_scaling_initializer(),
+                activation_fn=activation_fn,
+                normalizer_fn=normalizer_fn,
+                normalizer_params=normalizer_params) as sc:
+            return sc  # the scope captured inside both nested arg_scope blocks