diff --git a/gnes/preprocessor/io_utils/video.py b/gnes/preprocessor/io_utils/video.py
index 5cdb16c4..1d2b4eeb 100644
--- a/gnes/preprocessor/io_utils/video.py
+++ b/gnes/preprocessor/io_utils/video.py
@@ -18,7 +18,7 @@
 from typing import List
 
-from .ffmpeg import get_media_meta, compile_args
+from .ffmpeg import get_media_meta, compile_args, probe
 from .helper import _check_input, run_command, run_command_async
 
@@ -53,7 +53,7 @@ def scale_video(input_fn: str = 'pipe:',
         'crf': crf,
         'framerate': frame_rate,
         'acodec': 'aac',
-        'strict': 'experimental', # AAC audio encoder is experimental
+        'strict': 'experimental',  # AAC audio encoder is experimental
     }
 
     if scale:
@@ -142,71 +142,78 @@ def capture_frames(input_fn: str = 'pipe:',
                    **kwargs) -> List['np.ndarray']:
     _check_input(input_fn, input_data)
 
-    video_meta = get_media_meta(input_fn=input_fn, input_data=input_data)
-    width = video_meta['frame_width']
-    height = video_meta['frame_height']
-
-    if scale is not None:
-        _width, _height = map(int, scale.split(':'))
-        if _width * _height < 0:
-            if _width > 0:
-                ratio = _width / width
-                height = int(ratio * height)
-                if _height == -2:
-                    height += height % 2
-                width = _width
+    import tempfile
+
+    with tempfile.NamedTemporaryFile() as f:
+        if input_data:
+            f.write(input_data)
+            f.flush()
+            input_fn = f.name
+
+        video_meta = probe(input_fn)
+        width = video_meta['width']
+        height = video_meta['height']
+
+        if scale is not None:
+            _width, _height = map(int, scale.split(':'))
+            if _width * _height < 0:
+                if _width > 0:
+                    ratio = _width / width
+                    height = int(ratio * height)
+                    if _height == -2:
+                        height += height % 2
+                    width = _width
+                else:
+                    ratio = _height / height
+                    width = int(ratio * width)
+                    if _width == -2:
+                        width += width % 2
+
+                    height = _height
+
+                scale = '%d:%d' % (width, height)
             else:
-                ratio = _height / height
-                width = int(ratio * width)
-                if _width == -2:
-                    width += width % 2
-
+                width = _width
                 height = _height
 
-            scale = '%d:%d' % (width, height)
+        input_kwargs = {
+            'err_detect': 'aggressive',
+            'fflags': 'discardcorrupt'  # discard corrupted frames
+        }
+        if start_time is not None:
+            input_kwargs['ss'] = str(start_time)
         else:
-            width = _width
-            height = _height
-
-    input_kwargs = {
-        'err_detect': 'aggressive',
-        'fflags': 'discardcorrupt'  # discard corrupted frames
-    }
-    if start_time is not None:
-        input_kwargs['ss'] = str(start_time)
-    else:
-        start_time = 0.
-    if end_time is not None:
-        input_kwargs['t'] = str(end_time - start_time)
-
-    video_filters = []
-    if fps:
-        video_filters += ['fps=%d' % fps]
-    if scale:
-        video_filters += ['scale=%s' % scale]
-
-    output_kwargs = {
-        'format': 'image2pipe',
-        'pix_fmt': pix_fmt,
-        'vcodec': 'rawvideo',
-        'movflags': 'frag_keyframe+empty_moov',
-    }
-
-    cmd_args = compile_args(
-        input_fn=input_fn,
-        input_options=input_kwargs,
-        video_filters=video_filters,
-        output_options=output_kwargs)
-
-    out, _ = run_command(
-        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
-
-    depth = 3
-    if pix_fmt == 'rgba':
-        depth = 4
-
-    frames = np.frombuffer(out, np.uint8).reshape([-1, height, width, depth])
-    return frames
+            start_time = 0.
+        if end_time is not None:
+            input_kwargs['t'] = str(end_time - start_time)
+
+        video_filters = []
+        if fps:
+            video_filters += ['fps=%d' % fps]
+        if scale:
+            video_filters += ['scale=%s' % scale]
+
+        output_kwargs = {
+            'format': 'image2pipe',
+            'pix_fmt': pix_fmt,
+            'vcodec': 'rawvideo',
+            'movflags': 'faststart',
+        }
+
+        cmd_args = compile_args(
+            input_fn=input_fn,
+            input_options=input_kwargs,
+            video_filters=video_filters,
+            output_options=output_kwargs)
+        out, _ = run_command(cmd_args, pipe_stdout=True, pipe_stderr=True)
+
+        depth = 3
+        if pix_fmt == 'rgba':
+            depth = 4
+
+        frames = np.frombuffer(out,
+                               np.uint8).reshape([-1, height, width, depth])
+        return frames
 
 
 # def read_frame_as_jpg(in_filename, frame_num):
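For reviewers, a minimal sketch of the buffering pattern the new capture_frames adopts: the in-memory bytes are flushed to a NamedTemporaryFile so that ffprobe/ffmpeg can seek in a real file instead of reading a non-seekable pipe. The probe_video helper below is hypothetical and only illustrative; the patch itself calls probe() from gnes.preprocessor.io_utils.ffmpeg, and the sketch assumes ffprobe is available on PATH.

# Illustrative only -- not part of the patch; assumes ffprobe is installed.
import json
import subprocess
import tempfile

def probe_video(video_bytes: bytes) -> dict:
    # Buffer the in-memory video into a named temp file so ffprobe can seek.
    with tempfile.NamedTemporaryFile(suffix='.mp4') as f:
        f.write(video_bytes)
        f.flush()
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
             '-show_entries', 'stream=width,height', '-of', 'json', f.name],
            stdout=subprocess.PIPE, check=True)
        stream = json.loads(result.stdout)['streams'][0]
        return {'width': stream['width'], 'height': stream['height']}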