diff --git a/gnes/preprocessor/io_utils/audio.py b/gnes/preprocessor/io_utils/audio.py index 7828715b..c39acf48 100644 --- a/gnes/preprocessor/io_utils/audio.py +++ b/gnes/preprocessor/io_utils/audio.py @@ -19,33 +19,51 @@ import numpy as np import soundfile as sf +from .ffmpeg import compile_args +from .helper import run_command, run_command_async + from typing import List DEFAULT_SILENCE_DURATION = 0.3 DEFAULT_SILENCE_THRESHOLD = -60 -def capture_audio(filename: str = 'pipe:', - video_data: bytes = None, +def capture_audio(input_fn: str = 'pipe:', + input_data: bytes = None, bits_per_raw_sample: int = 16, sample_rate: int = 16000, + start_time: float = None, + end_time: float = None, **kwargs) -> List['np.ndarray']: - capture_stdin = (filename == 'pipe:') - if capture_stdin and video_data is None: + capture_stdin = (input_fn == 'pipe:') + if capture_stdin and input_data is None: raise ValueError( "the buffered video data for stdin should not be empty") - stream = ffmpeg.input(filename) - stream = stream.output( - 'pipe:', - format='wav', - bits_per_raw_sample=bits_per_raw_sample, - ac=1, - ar=16000) + input_kwargs = {} + if start_time is not None: + input_kwargs['ss'] = str(start_time) + else: + start_time = 0. 
+ if end_time is not None: + input_kwargs['t'] = str(end_time - start_time) + + output_kwargs = { + 'format': 'wav', + 'bits_per_raw_sample': bits_per_raw_sample, + 'ac': 1, + 'ar': sample_rate + } + + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + output_options=output_kwargs, + overwrite_output=True) - stdout, _ = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + stdout, _ = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) audio_stream = io.BytesIO(stdout) audio_data, sample_rate = sf.read(audio_stream) @@ -56,8 +74,8 @@ def capture_audio(filename: str = 'pipe:', return audio_data -def get_chunk_times(filename: str = 'pipe:', - video_data: bytes = None, +def get_chunk_times(input_fn: str = 'pipe:', + input_data: bytes = None, silence_threshold: float = DEFAULT_SILENCE_THRESHOLD, silence_duration: float = DEFAULT_SILENCE_DURATION, start_time: float = None, @@ -78,15 +96,20 @@ def get_chunk_times(filename: str = 'pipe:', if end_time is not None: input_kwargs['t'] = end_time - start_time - stream = ffmpeg.input(filename, **input_kwargs) - stream = stream.filter( - 'silencedetect', - n='{}dB'.format(silence_threshold), - d=silence_duration) - stream = stream.output('pipe:', format='null') + au_filters = [ + 'silencedetect=noise={}dB:d={}'.format(silence_threshold, + silence_duration) + ] - stdout, stderr = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + output_kwargs = {'format': 'null'} + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + audio_filters=au_filters, + output_options=output_kwargs) + + stdout, stderr = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) lines = stderr.decode().splitlines() @@ -121,28 +144,41 @@ def get_chunk_times(filename: str = 'pipe:', return list(zip(chunk_starts, chunk_ends)) -def split_audio(filename: str = 'pipe:', - video_data: bytes = None, +def 
split_audio(input_fn: str = 'pipe:', + input_data: bytes = None, silence_threshold=DEFAULT_SILENCE_THRESHOLD, silence_duration=DEFAULT_SILENCE_DURATION, start_time: float = None, end_time: float = None, verbose=False): chunk_times = get_chunk_times( - filename, - video_data=video_data, + input_fn, + input_data=input_data, silence_threshold=silence_threshold, silence_duration=silence_duration, start_time=start_time, end_time=end_time) - audio_chunks = list() for i, (start_time, end_time) in enumerate(chunk_times): time = end_time - start_time - stream = ffmpeg.input(filename, ss=start_time, t=time) - stream = stream.output('pipe:', format='wav') - stdout, _ = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + if time < 0: + continue + input_kwargs = { + 'ss': start_time, + 't': time + } + + output_kwargs = { + 'format': 'wav' + } + + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + output_options=output_kwargs) + + stdout, stderr = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) audio_stream = io.BytesIO(stdout) audio_data, sample_rate = sf.read(audio_stream) diff --git a/gnes/preprocessor/io_utils/ffmpeg.py b/gnes/preprocessor/io_utils/ffmpeg.py new file mode 100644 index 00000000..677a3a27 --- /dev/null +++ b/gnes/preprocessor/io_utils/ffmpeg.py @@ -0,0 +1,140 @@ +# Tencent is pleased to support the open source community by making GNES available. +# +# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# Regexes that scrape stream metadata out of the banner ffmpeg prints on
# stderr.  NOTE(review): the named groups were reconstructed after an
# angle-bracket-stripping mangle; only 'lang' is provably required (it is
# read via groupdict() below) -- confirm the other names against upstream.
VIDEO_DUR_PATTERN = re.compile(r".*Duration: (\d+):(\d+):(\d+)", re.DOTALL)
VIDEO_INFO_PATTERN = re.compile(
    r'.*Stream #0:(\d+)(?:\(\w+\))?: Video: (\w+).*, (yuv\w+)[(,].* (\d+)x(\d+).* (\d+)(\.\d.)? fps',
    re.DOTALL)
AUDIO_INFO_PATTERN = re.compile(
    r'^\s+Stream #0:(?P<idx>\d+)(\((?P<lang>\w+)\))?: Audio: (?P<codec>\w+).*?(?P<default>\(default\))?$',
    re.MULTILINE)
STREAM_SUBTITLE_PATTERN = re.compile(
    r'^\s+Stream #0:(?P<idx>\d+)(\((?P<lang>\w+)\))?: Subtitle:',
    re.MULTILINE)


def parse_media_details(infos):
    """Parse ffmpeg's stderr banner into a media-info dict.

    :param infos: decoded stderr text of an ffmpeg invocation
    :return: dict with keys ``vcodec``, ``frame_width``, ``frame_height``,
        ``duration`` (whole seconds), ``fps``, ``pix_fmt`` and ``audio``
        (list of per-track dicts containing at least ``lang`` and ``codec``)
    :raises ValueError: if no duration or video-stream line is present
    """
    video_dur_match = VIDEO_DUR_PATTERN.match(infos)
    if video_dur_match is None:
        # previously this fell through to an opaque AttributeError on None
        raise ValueError('could not find "Duration:" in ffmpeg output')
    dur_hrs, dur_mins, dur_secs = video_dur_match.group(1, 2, 3)

    video_info_match = VIDEO_INFO_PATTERN.match(infos)
    if video_info_match is None:
        raise ValueError('could not find a video stream in ffmpeg output')
    codec, pix_fmt, res_width, res_height, fps = video_info_match.group(
        2, 3, 4, 5, 6)

    audio_tracks = []
    for audio_match in AUDIO_INFO_PATTERN.finditer(infos):
        ainfo = audio_match.groupdict()
        if ainfo['lang'] is None:
            # ffmpeg omits the language tag on untagged tracks; 'und' is the
            # ISO 639-2 code for "undetermined"
            ainfo['lang'] = 'und'
        audio_tracks.append(ainfo)

    return {
        'vcodec': codec,
        'frame_width': int(res_width),
        'frame_height': int(res_height),
        'duration': int(dur_hrs) * 3600 + int(dur_mins) * 60 + int(dur_secs),
        'fps': int(fps),
        'pix_fmt': pix_fmt,
        'audio': audio_tracks,
    }


def compile_args(input_fn: str = 'pipe:',
                 output_fn: str = 'pipe:',
                 video_filters: list = None,
                 audio_filters: list = None,
                 input_options: dict = None,
                 output_options: dict = None,
                 overwrite_output: bool = True):
    """Assemble an `FFmpeg <https://ffmpeg.org/>`_ command line (argv list).

    ``format``, ``video_bitrate`` and ``audio_bitrate`` keys in the option
    dicts map to their dedicated flags (``-f``, ``-b:v``, ``-b:a``); any
    other key ``k`` is emitted as ``-k value`` (value omitted when ``None``).

    :param input_fn: input file name, or ``'pipe:'`` for stdin
    :param output_fn: output file name, or ``'pipe:'`` for stdout
    :param video_filters: entries joined into a single ``-vf`` chain
    :param audio_filters: entries joined into a single ``-af`` chain
    :param overwrite_output: add ``-y`` so existing files are clobbered
    :return: list of argv tokens, starting with ``'ffmpeg'``
    """
    # Work on copies: the original implementation pop()ed keys straight out
    # of the caller's dicts, mutating them as a side effect; it also used
    # mutable default arguments shared across calls.
    input_options = dict(input_options or {})
    output_options = dict(output_options or {})
    video_filters = list(video_filters or [])
    audio_filters = list(audio_filters or [])

    args = ['ffmpeg']
    if overwrite_output:
        # '-y' is a global option; placing it before the inputs avoids the
        # "trailing option(s) may be ignored" behavior of newer ffmpeg when
        # it was appended after the output file.
        args += ['-y']

    fmt = input_options.pop('format', None)
    if fmt:
        args += ['-f', fmt]
    args += kwargs_to_cmd_args(input_options)
    args += ['-i', input_fn]

    if video_filters:
        args += ['-vf', ','.join(video_filters)]
    if audio_filters:
        args += ['-af', ','.join(audio_filters)]

    fmt = output_options.pop('format', None)
    if fmt:
        args += ['-f', fmt]
    video_bitrate = output_options.pop('video_bitrate', None)
    if video_bitrate:
        args += ['-b:v', str(video_bitrate)]
    audio_bitrate = output_options.pop('audio_bitrate', None)
    if audio_bitrate:
        args += ['-b:a', str(audio_bitrate)]
    args += kwargs_to_cmd_args(output_options)
    args += [output_fn]

    return args


def probe(input_fn: str):
    """Probe basic video properties of *input_fn* with ffprobe.

    :return: dict with ``file``, ``width``, ``height``, ``fps`` (float) and
        ``duration`` (sexagesimal string, because of ``-sexagesimal``)
    :raises Exception: propagated from run_command on ffprobe failure
    """
    command = [
        'ffprobe', '-v', 'fatal', '-show_entries',
        'stream=width,height,r_frame_rate,duration', '-of',
        'default=noprint_wrappers=1:nokey=1', input_fn, '-sexagesimal'
    ]
    out, err = run_command(command, pipe_stdout=True, pipe_stderr=True)

    # nokey=1 output: one bare value per line in the requested entry order
    fields = out.decode().split('\n')
    num, den = fields[2].split('/')
    return {
        'file': input_fn,
        'width': int(fields[0]),
        'height': int(fields[1]),
        'fps': float(num) / float(den),
        'duration': fields[3],
    }


def get_media_meta(input_fn: str = 'pipe:',
                   input_data: bytes = None,
                   input_options: dict = None):
    """Run ffmpeg in metadata-export mode and parse its stderr banner.

    :param input_fn: input file, or ``'pipe:'`` to feed *input_data* on stdin
    :param input_options: extra input options; only ``format`` is consumed
    :return: the dict produced by :func:`parse_media_details`
    """
    # copy so the caller's dict is not mutated by pop()
    input_options = dict(input_options or {})
    cmd_args = ['ffmpeg']

    fmt = input_options.pop('format', None)
    if fmt:
        cmd_args += ['-f', fmt]
    cmd_args += ['-i', input_fn]

    # '-f ffmetadata' makes ffmpeg exit quickly after dumping metadata; the
    # stream details we actually want are printed on stderr.
    cmd_args += ['-f', 'ffmetadata', 'pipe:']
    out, err = run_command(
        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
    return parse_media_details(err.decode())
a/gnes/preprocessor/io_utils/gif.py +++ b/gnes/preprocessor/io_utils/gif.py @@ -17,8 +17,9 @@ import numpy as np import subprocess as sp import tempfile -import ffmpeg -from .helper import extract_frame_size + +from .ffmpeg import parse_media_details + def decode_gif(data: bytes, fps: int = -1, @@ -35,7 +36,9 @@ def decode_gif(data: bytes, fps: int = -1, out, err = stream.run(capture_stdout=True, capture_stderr=True) - width, height = extract_frame_size(err.decode()) + meta_info = parse_media_details(err.decode()) + width = meta_info['frame_width'] + height = meta_info['frame_height'] depth = 3 if pix_fmt == 'rgba': @@ -43,18 +46,13 @@ def decode_gif(data: bytes, fps: int = -1, frames = np.frombuffer(out, np.uint8).reshape([-1, height, width, depth]) - return list(frames) + return frames def encode_gif( - images: np.ndarray, - scale: str, + images: List[np.ndarray], fps: int, pix_fmt: str = 'rgb24'): - """ - https://superuser.com/questions/556029/how-do-i-convert-a-video-to-gif-using-ffmpeg-with-reasonable-quality - https://gist.github.com/alexlee-gk/38916bf524dc75ca1b988d113aa30710 - """ cmd = [ 'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-r', diff --git a/gnes/preprocessor/io_utils/helper.py b/gnes/preprocessor/io_utils/helper.py index 86c15fa4..97b6e614 100644 --- a/gnes/preprocessor/io_utils/helper.py +++ b/gnes/preprocessor/io_utils/helper.py @@ -13,38 +13,60 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re - - -# def ffmpeg_probe_pattern(): -# mediaprobe_re = re.compile( -# r"Duration:\s+(?P(?:(?:\d:?)+[.]?\d*)|N/A)(?:.+start:\s+(?P\d+[.]\d+))?.+bitrate:\s+(?P(?:\d+\s*..[/]s)|N/A)" -# ) -# streamprobe_re = re.compile( -# r"\s*Stream.+:\s+Video:.+\s+(?P\d+x\d+)(?:.*,\s*(?P\d+[.]?\d*)\sfps)?(?:.+\(default\))?" 
def kwargs_to_cmd_args(kwargs):
    """Flatten an options dict into ffmpeg-style argv tokens.

    ``{'ss': 1, 'an': None}`` becomes ``['-ss', '1', '-an']`` -- a ``None``
    value emits the flag alone, anything else is stringified after it.
    """
    args = []
    for key, value in kwargs.items():
        args.append('-%s' % key)
        if value is not None:
            args.append('%s' % str(value))
    return args


def run_command_async(cmd_args,
                      pipe_stdin=True,
                      pipe_stdout=False,
                      pipe_stderr=False,
                      quiet=False):
    """Spawn *cmd_args* as a subprocess and return the ``Popen`` handle.

    ``quiet=True`` captures stdout/stderr even when the corresponding
    ``pipe_*`` flag is off, so the child cannot write to the console.
    """
    stdin_stream = sp.PIPE if pipe_stdin else None
    stdout_stream = sp.PIPE if pipe_stdout or quiet else None
    stderr_stream = sp.PIPE if pipe_stderr or quiet else None

    return sp.Popen(
        cmd_args,
        stdin=stdin_stream,
        stdout=stdout_stream,
        stderr=stderr_stream)


def wait(process):
    """Echo *process*'s stdout line by line until it exits.

    :param process: a ``Popen`` created with ``stdout=sp.PIPE``
    :return: tuple of (last line read, exit code)
    """
    while True:
        output = process.stdout.readline()
        # BUG FIX: stdout is a byte stream, so the original sentinel test
        # ``output == ''`` (a str comparison) could never become true;
        # test for emptiness instead, which works for bytes and str alike.
        if not output and process.poll() is not None:
            break
        if output:
            print(output.strip())
    return (output, process.poll())


def run_command(cmd_args,
                input=None,
                pipe_stdin=True,
                pipe_stdout=False,
                pipe_stderr=False,
                quiet=False):
    """Run *cmd_args* to completion and return ``(stdout, stderr)`` bytes.

    :param input: optional bytes fed to the child's stdin
        (the name shadows the builtin but is part of the public keyword API)
    :raises Exception: if the child exits with a non-zero status; the
        captured stderr is embedded in the message for diagnosis
    """
    with run_command_async(
            cmd_args,
            pipe_stdin=pipe_stdin,
            pipe_stdout=pipe_stdout,
            pipe_stderr=pipe_stderr,
            quiet=quiet) as proc:
        stdout, stderr = proc.communicate(input)
        retcode = proc.poll()

    if retcode:
        raise Exception('ffmpeg error: %s' % stderr)
    return stdout, stderr
a/gnes/preprocessor/io_utils/video.py b/gnes/preprocessor/io_utils/video.py index 21238dea..3fa0f2da 100644 --- a/gnes/preprocessor/io_utils/video.py +++ b/gnes/preprocessor/io_utils/video.py @@ -13,17 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import io import ffmpeg import numpy as np from typing import List -from .helper import extract_frame_size +from .ffmpeg import parse_media_details, compile_args +from .helper import run_command, run_command_async -def scale_video(input_filename: str = 'pipe:', - output_filename: str = 'pipe:', - video_data: bytes = None, +def scale_video(input_fn: str = 'pipe:', + output_fn: str = 'pipe:', + input_data: bytes = None, start_time: float = None, end_time: float = None, scale: str = None, @@ -33,12 +35,12 @@ def scale_video(input_filename: str = 'pipe:', format: str = 'mp4', pix_fmt: str = 'yuv420p', **kwargs): - capture_stdin = (input_filename == 'pipe:') - if capture_stdin and video_data is None: + capture_stdin = (input_fn == 'pipe:') + if capture_stdin and input_data is None: raise ValueError( "the buffered video data for stdin should not be empty") - capture_stdout = (output_filename == 'pipe:') + capture_stdout = (output_fn == 'pipe:') input_kwargs = {} if start_time is not None: @@ -48,8 +50,6 @@ def scale_video(input_filename: str = 'pipe:', if end_time is not None: input_kwargs['t'] = end_time - start_time - stream = ffmpeg.input(input_filename, **input_kwargs) - out_kwargs = { 'vcodec': vcodec, 'pix_fmt': pix_fmt, @@ -65,65 +65,84 @@ def scale_video(input_filename: str = 'pipe:', # an empty moov means it doesn't need to seek and thus works with a pipe. 
def encode_video(images: List['np.ndarray'],
                 pix_fmt: str = 'rgb24',
                 frame_rate: int = 15,
                 output_fn: str = 'pipe:',
                 vcodec: str = 'libx264',
                 format: str = 'mp4',
                 **kwargs):
    """Encode a list of equally-shaped HxWxC frames into a video container.

    All frames are cast to uint8 and fed to ffmpeg as one rawvideo stream.

    :param images: non-empty list of ndarray frames, all the same shape
    :param pix_fmt: pixel format of the raw input frames
    :param output_fn: output file name, or ``'pipe:'`` to return the
        encoded bytes instead of writing a file
    :return: encoded container bytes when ``output_fn == 'pipe:'``,
        otherwise ``None`` (stdout is not captured)
    :raises ValueError: if *images* is empty
    :raises IOError: if ffmpeg exits with a non-zero status
    """
    if not images:
        # previously this crashed with a bare IndexError on images[0]
        raise ValueError('cannot encode an empty list of frames')

    height, width, _ = images[0].shape
    capture_stdout = (output_fn == 'pipe:')

    input_kwargs = {
        'format': 'rawvideo',
        'pix_fmt': pix_fmt,
        'framerate': frame_rate,
        's': '{}x{}'.format(width, height),
    }

    output_kwargs = {
        'vcodec': vcodec,
        'r': frame_rate,
        'pix_fmt': 'yuv420p',
        'format': format,
        # fragmented mp4 needs no seekable output, so it works over a pipe
        'movflags': 'frag_keyframe+empty_moov',
    }

    cmd_args = compile_args(
        input_fn='pipe:',
        output_fn=output_fn,
        input_options=input_kwargs,
        output_options=output_kwargs)

    # concatenate all raw frames once and hand them to communicate(), which
    # writes stdin and drains stdout/stderr without deadlocking
    payload = b''.join(frame.astype(np.uint8).tobytes() for frame in images)

    with run_command_async(
            cmd_args,
            pipe_stdin=True,
            pipe_stdout=capture_stdout,
            pipe_stderr=True) as proc:
        output, err = proc.communicate(payload)

    if proc.returncode:
        # surface both the exact command line and ffmpeg's stderr
        raise IOError('\n'.join([' '.join(cmd_args), err.decode('utf8')]))
    return output
def capture_frames(input_fn: str = 'pipe:',
                   input_data: bytes = None,
                   pix_fmt: str = 'rgb24',
                   fps: int = -1,
                   scale: str = None,
                   start_time: float = None,
                   end_time: float = None,
                   **kwargs) -> 'np.ndarray':
    """Decode a video into a stack of raw frames via ffmpeg.

    :param input_fn: input file, or ``'pipe:'`` to read *input_data* from stdin
    :param fps: resample to this frame rate when positive; <= 0 keeps the
        source rate
    :param scale: optional ``'W:H'`` output size (both must be concrete
        integers -- ffmpeg's keep-aspect ``-1`` would break the reshape below)
    :return: uint8 ndarray of shape (num_frames, height, width, depth)
    :raises ValueError: when reading from stdin without *input_data*
    """
    capture_stdin = (input_fn == 'pipe:')
    if capture_stdin and input_data is None:
        raise ValueError(
            "the buffered video data for stdin should not be empty")

    # NOTE(review): the dict opener sat in a hidden diff-hunk gap; the
    # 'err_detect' entry is reconstructed -- confirm against upstream.
    input_kwargs = {
        'err_detect': 'aggressive',
        'fflags': 'discardcorrupt'  # discard corrupted frames
    }
    if start_time is not None:
        input_kwargs['ss'] = str(start_time)
    else:
        start_time = 0.
    if end_time is not None:
        input_kwargs['t'] = str(end_time - start_time)

    video_filters = []
    # BUG FIX: the default fps of -1 is truthy, so the original ``if fps:``
    # injected a bogus 'fps=-1' filter; only apply it for a positive rate
    # (matching the pre-refactor ``fps > 0`` condition).
    if fps > 0:
        video_filters += ['fps=%d' % fps]
    if scale:
        video_filters += ['scale=%s' % scale]

    output_kwargs = {
        'format': 'image2pipe',
        'pix_fmt': pix_fmt,
        'vcodec': 'rawvideo'
    }

    cmd_args = compile_args(
        input_fn=input_fn,
        input_options=input_kwargs,
        video_filters=video_filters,
        output_options=output_kwargs)

    out, err = run_command(
        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)

    if scale:
        width, height = map(int, scale.split(':'))
    else:
        # no explicit scale: recover the frame geometry from ffmpeg's banner
        meta_info = parse_media_details(err.decode())
        width = meta_info['frame_width']
        height = meta_info['frame_height']

    depth = 4 if pix_fmt == 'rgba' else 3

    return np.frombuffer(out, np.uint8).reshape([-1, height, width, depth])