refactor(ffmpeg): refactor ffmpeg again

gnes-ai · Aug 26, 2019 · 10cef54 · 10cef54
1 parent 71cb36f
commit 10cef54
Show file tree

Hide file tree

Showing 5 changed files with 362 additions and 135 deletions.
diff --git a/gnes/preprocessor/io_utils/audio.py b/gnes/preprocessor/io_utils/audio.py
@@ -19,33 +19,51 @@
 import numpy as np
 import soundfile as sf
 
+from .ffmpeg import compile_args
+from .helper import run_command, run_command_async
+
 from typing import List
 
 DEFAULT_SILENCE_DURATION = 0.3
 DEFAULT_SILENCE_THRESHOLD = -60
 
 
-def capture_audio(filename: str = 'pipe:',
-                  video_data: bytes = None,
+def capture_audio(input_fn: str = 'pipe:',
+                  input_data: bytes = None,
                   bits_per_raw_sample: int = 16,
                   sample_rate: int = 16000,
+                  start_time: float = None,
+                  end_time: float = None,
                   **kwargs) -> List['np.ndarray']:
 
-    capture_stdin = (filename == 'pipe:')
-    if capture_stdin and video_data is None:
+    capture_stdin = (input_fn == 'pipe:')
+    if capture_stdin and input_data is None:
         raise ValueError(
             "the buffered video data for stdin should not be empty")
 
-    stream = ffmpeg.input(filename)
-    stream = stream.output(
-        'pipe:',
-        format='wav',
-        bits_per_raw_sample=bits_per_raw_sample,
-        ac=1,
-        ar=16000)
+    input_kwargs = {}
+    if start_time is not None:
+        input_kwargs['ss'] = str(start_time)
+    else:
+        start_time = 0.
+    if end_time is not None:
+        input_kwargs['t'] = str(end_time - start_time)
+
+    output_kwargs = {
+        'format': 'wav',
+        'bits_per_raw_sample': bits_per_raw_sample,
+        'ac': 1,
+        'ar': sample_rate
+    }
+
+    cmd_args = compile_args(
+        input_fn=input_fn,
+        input_options=input_kwargs,
+        output_options=output_kwargs,
+        overwrite_output=True)
 
-    stdout, _ = stream.run(
-        input=video_data, capture_stdout=True, capture_stderr=True)
+    stdout, _ = run_command(
+        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
 
     audio_stream = io.BytesIO(stdout)
     audio_data, sample_rate = sf.read(audio_stream)
@@ -56,8 +74,8 @@ def capture_audio(filename: str = 'pipe:',
     return audio_data
 
 
-def get_chunk_times(filename: str = 'pipe:',
-                    video_data: bytes = None,
+def get_chunk_times(input_fn: str = 'pipe:',
+                    input_data: bytes = None,
                     silence_threshold: float = DEFAULT_SILENCE_THRESHOLD,
                     silence_duration: float = DEFAULT_SILENCE_DURATION,
                     start_time: float = None,
@@ -78,15 +96,20 @@ def get_chunk_times(filename: str = 'pipe:',
     if end_time is not None:
         input_kwargs['t'] = end_time - start_time
 
-    stream = ffmpeg.input(filename, **input_kwargs)
-    stream = stream.filter(
-        'silencedetect',
-        n='{}dB'.format(silence_threshold),
-        d=silence_duration)
-    stream = stream.output('pipe:', format='null')
+    au_filters = [
+        'silencedetect=noise={}dB:d={}'.format(silence_threshold,
+                                               silence_duration)
+    ]
 
-    stdout, stderr = stream.run(
-        input=video_data, capture_stdout=True, capture_stderr=True)
+    output_kwargs = {'format': 'null'}
+    cmd_args = compile_args(
+        input_fn=input_fn,
+        input_options=input_kwargs,
+        audio_filters=au_filters,
+        output_options=output_kwargs)
+
+    stdout, stderr = run_command(
+        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
 
     lines = stderr.decode().splitlines()
 
@@ -121,28 +144,41 @@ def get_chunk_times(filename: str = 'pipe:',
     return list(zip(chunk_starts, chunk_ends))
 
 
-def split_audio(filename: str = 'pipe:',
-                video_data: bytes = None,
+def split_audio(input_fn: str = 'pipe:',
+                input_data: bytes = None,
                 silence_threshold=DEFAULT_SILENCE_THRESHOLD,
                 silence_duration=DEFAULT_SILENCE_DURATION,
                 start_time: float = None,
                 end_time: float = None,
                 verbose=False):
     chunk_times = get_chunk_times(
-        filename,
-        video_data=video_data,
+        input_fn,
+        input_data=input_data,
         silence_threshold=silence_threshold,
         silence_duration=silence_duration,
         start_time=start_time,
         end_time=end_time)
-
     audio_chunks = list()
     for i, (start_time, end_time) in enumerate(chunk_times):
         time = end_time - start_time
-        stream = ffmpeg.input(filename, ss=start_time, t=time)
-        stream = stream.output('pipe:', format='wav')
-        stdout, _ = stream.run(
-            input=video_data, capture_stdout=True, capture_stderr=True)
+        if time < 0:
+            continue
+        input_kwargs = {
+            'ss': start_time,
+            't': time
+        }
+
+        output_kwargs = {
+            'format': 'wav'
+        }
+
+        cmd_args = compile_args(
+            input_fn=input_fn,
+            input_options=input_kwargs,
+            output_options=output_kwargs)
+
+        stdout, stderr = run_command(
+            cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
 
         audio_stream = io.BytesIO(stdout)
         audio_data, sample_rate = sf.read(audio_stream)

diff --git a/gnes/preprocessor/io_utils/ffmpeg.py b/gnes/preprocessor/io_utils/ffmpeg.py
@@ -0,0 +1,140 @@
+#  Tencent is pleased to support the open source community by making GNES available.
+#
+#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the 'License');
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an 'AS IS' BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import re
+from .helper import kwargs_to_cmd_args, run_command, run_command_async
+
+# Container duration as printed by ffmpeg: captures hours, minutes and whole
+# seconds of the "Duration: HH:MM:SS" field (the fractional part is ignored).
+VIDEO_DUR_PATTERN = re.compile(r".*Duration: (\d+):(\d+):(\d+)", re.DOTALL)
+# Video stream line: captures (1) stream index, (2) codec name, (3) a pixel
+# format starting with "yuv", (4) width, (5) height, (6) the integer digits
+# of the fps value and (7) an optional fractional tail.
+VIDEO_INFO_PATTERN = re.compile(
+    r'.*Stream #0:(\d+)(?:\(\w+\))?: Video: (\w+).*, (yuv\w+)[(,].* (\d+)x(\d+).* (\d+)(\.\d.)? fps',
+    re.DOTALL)
+# One match per audio stream line; named groups: stream, lang, format, default.
+AUDIO_INFO_PATTERN = re.compile(
+    r'^\s+Stream #0:(?P<stream>\d+)(\((?P<lang>\w+)\))?: Audio: (?P<format>\w+).*?(?P<default>\(default\))?$',
+    re.MULTILINE)
+# Subtitle stream line; named groups: stream, lang.
+STREAM_SUBTITLE_PATTERN = re.compile(
+    r'^\s+Stream #0:(?P<stream>\d+)(\((?P<lang>\w+)\))?: Subtitle:',
+    re.MULTILINE)
+
+
+def parse_media_details(infos):
+    """Extract video and audio properties from ffmpeg's textual stream report.
+
+    :param infos: decoded text containing a ``Duration:`` field and a
+        ``Stream #0:N: Video:`` line (e.g. the stderr of an ffmpeg run).
+    :return: dict with the keys ``vcodec``, ``frame_width``, ``frame_height``,
+        ``duration`` (whole seconds), ``fps`` (integer part only),
+        ``pix_fmt`` and ``audio`` (one dict per audio stream with the keys
+        ``stream``, ``lang``, ``format`` and ``default``).
+    :raises AttributeError: if the duration or the video stream line cannot
+        be found, because the failed match is ``None``.
+    """
+    hours, minutes, seconds = VIDEO_DUR_PATTERN.match(infos).group(1, 2, 3)
+
+    video_fields = VIDEO_INFO_PATTERN.match(infos).group(2, 3, 4, 5, 6)
+    vcodec, pixel_format, width, height, frame_rate = video_fields
+
+    # Streams without a language tag are reported as 'und' (undetermined).
+    audio_tracks = [
+        dict(found.groupdict(),
+             lang='und' if found.group('lang') is None else found.group('lang'))
+        for found in AUDIO_INFO_PATTERN.finditer(infos)
+    ]
+
+    return {
+        'vcodec': vcodec,
+        'frame_width': int(width),
+        'frame_height': int(height),
+        'duration': 3600 * int(hours) + 60 * int(minutes) + int(seconds),
+        'fps': int(frame_rate),
+        'pix_fmt': pixel_format,
+        'audio': audio_tracks,
+    }
+
+
+def compile_args(input_fn: str = 'pipe:',
+                 output_fn: str = 'pipe:',
+                 video_filters: list = None,
+                 audio_filters: list = None,
+                 input_options: dict = None,
+                 output_options: dict = None,
+                 overwrite_output: bool = True):
+    """Build the argument list of an `FFmpeg <https://www.ffmpeg.org/>`_
+    command line.
+
+    The resulting list has the layout
+    ``ffmpeg <input options> -i input_fn [-vf ...] [-af ...] <output options>
+    output_fn [-y]``.
+
+    :param input_fn: input file name, or ``'pipe:'`` to read from stdin.
+    :param output_fn: output file name, or ``'pipe:'`` to write to stdout.
+    :param video_filters: filter expressions joined with ``,`` into ``-vf``.
+    :param audio_filters: filter expressions joined with ``,`` into ``-af``.
+    :param input_options: options placed before ``-i``; ``format`` becomes
+        ``-f``, everything else is rendered by ``kwargs_to_cmd_args``.
+    :param output_options: options placed before the output name; ``format``,
+        ``video_bitrate`` and ``audio_bitrate`` become ``-f``, ``-b:v`` and
+        ``-b:a``, everything else is rendered by ``kwargs_to_cmd_args``.
+    :param overwrite_output: append ``-y`` when true.
+    :return: the command as a list of arguments, starting with ``'ffmpeg'``.
+
+    The option dicts passed in are never modified.
+    """
+    # Work on private copies: the special keys are popped below, and that
+    # must not leak into the caller's dicts (or into a shared default).
+    input_options = dict(input_options) if input_options else {}
+    output_options = dict(output_options) if output_options else {}
+
+    input_args = []
+    fmt = input_options.pop('format', None)
+    if fmt:
+        input_args += ['-f', fmt]
+    input_args += kwargs_to_cmd_args(input_options)
+    input_args += ['-i', input_fn]
+
+    vf_args = ['-vf', ','.join(video_filters)] if video_filters else []
+    af_args = ['-af', ','.join(audio_filters)] if audio_filters else []
+
+    output_args = []
+    fmt = output_options.pop('format', None)
+    if fmt:
+        output_args += ['-f', fmt]
+    video_bitrate = output_options.pop('video_bitrate', None)
+    if video_bitrate:
+        output_args += ['-b:v', str(video_bitrate)]
+    audio_bitrate = output_options.pop('audio_bitrate', None)
+    if audio_bitrate:
+        output_args += ['-b:a', str(audio_bitrate)]
+    output_args += kwargs_to_cmd_args(output_options)
+    output_args += [output_fn]
+
+    args = ['ffmpeg'] + input_args + vf_args + af_args + output_args
+    if overwrite_output:
+        args += ['-y']
+
+    return args
+
+
+def probe(input_fn: str):
+    """Read basic stream properties of a media file by running ``ffprobe``.
+
+    The command asks for ``width``, ``height``, ``r_frame_rate`` and
+    ``duration`` of the streams, printed one bare value per line; the first
+    four output lines are interpreted in that order.
+
+    :param input_fn: name of the file handed to ffprobe.
+    :return: dict with ``file``, ``width``, ``height``, ``fps`` (the
+        ``num/den`` frame rate evaluated as a float) and ``duration`` (kept
+        as the string printed by ffprobe).
+    """
+    command = ['ffprobe', '-v', 'fatal']
+    command += ['-show_entries', 'stream=width,height,r_frame_rate,duration']
+    command += ['-of', 'default=noprint_wrappers=1:nokey=1']
+    command += [input_fn, '-sexagesimal']
+
+    stdout, _ = run_command(command, pipe_stdout=True, pipe_stderr=True)
+    fields = stdout.decode().split('\n')
+
+    width = int(fields[0])
+    height = int(fields[1])
+    # r_frame_rate is printed as a fraction such as "30000/1001".
+    rate = fields[2].split('/')
+    fps = float(rate[0]) / float(rate[1])
+
+    return {
+        'file': input_fn,
+        'width': width,
+        'height': height,
+        'fps': fps,
+        'duration': fields[3]
+    }
+
+
+def get_media_meta(input_fn: str = 'pipe:',
+                   input_data: bytes = None,
+                   input_options: dict = None):
+    """Run ffmpeg on the input and parse the stream report it prints.
+
+    ffmpeg is asked to write ``ffmetadata`` to stdout; the returned details
+    are parsed from what it prints on stderr.
+
+    :param input_fn: input file name, or ``'pipe:'`` to feed ``input_data``
+        through stdin.
+    :param input_data: bytes passed to the process as its input.
+    :param input_options: only the ``format`` entry is used (as ``-f`` before
+        the input); the dict is left unmodified.
+    :return: the dict produced by ``parse_media_details``.
+    """
+    # Look the format up without popping, so neither the caller's dict nor a
+    # shared default is altered by this call.
+    fmt = input_options.get('format') if input_options else None
+
+    cmd_args = ['ffmpeg']
+    if fmt:
+        cmd_args += ['-f', fmt]
+    cmd_args += ['-i', input_fn]
+    cmd_args += ['-f', 'ffmetadata', 'pipe:']
+
+    # Only stderr is of interest: that is where ffmpeg prints stream details.
+    _, err = run_command(
+        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
+    return parse_media_details(err.decode())
diff --git a/gnes/preprocessor/io_utils/gif.py b/gnes/preprocessor/io_utils/gif.py
@@ -17,8 +17,9 @@
 import numpy as np
 import subprocess as sp
 import tempfile
-import ffmpeg
-from .helper import extract_frame_size
+
+from .ffmpeg import parse_media_details
+
 
 
 def decode_gif(data: bytes, fps: int = -1,
@@ -35,26 +36,23 @@ def decode_gif(data: bytes, fps: int = -1,
 
         out, err = stream.run(capture_stdout=True, capture_stderr=True)
 
-        width, height = extract_frame_size(err.decode())
+        meta_info = parse_media_details(err.decode())
+        width = meta_info['frame_width']
+        height = meta_info['frame_height']
 
         depth = 3
         if pix_fmt == 'rgba':
             depth = 4
 
         frames = np.frombuffer(out,
                                np.uint8).reshape([-1, height, width, depth])
-        return list(frames)
+        return frames
 
 
 def encode_gif(
-        images: np.ndarray,
-        scale: str,
+        images: List[np.ndarray],
         fps: int,
         pix_fmt: str = 'rgb24'):
-    """
-    https://superuser.com/questions/556029/how-do-i-convert-a-video-to-gif-using-ffmpeg-with-reasonable-quality
-    https://gist.github.com/alexlee-gk/38916bf524dc75ca1b988d113aa30710
-    """
 
     cmd = [
         'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-r',