diff --git a/gnes/preprocessor/io_utils/audio.py b/gnes/preprocessor/io_utils/audio.py index 7828715b..c39acf48 100644 --- a/gnes/preprocessor/io_utils/audio.py +++ b/gnes/preprocessor/io_utils/audio.py @@ -19,33 +19,51 @@ import numpy as np import soundfile as sf +from .ffmpeg import compile_args +from .helper import run_command, run_command_async + from typing import List DEFAULT_SILENCE_DURATION = 0.3 DEFAULT_SILENCE_THRESHOLD = -60 -def capture_audio(filename: str = 'pipe:', - video_data: bytes = None, +def capture_audio(input_fn: str = 'pipe:', + input_data: bytes = None, bits_per_raw_sample: int = 16, sample_rate: int = 16000, + start_time: float = None, + end_time: float = None, **kwargs) -> List['np.ndarray']: - capture_stdin = (filename == 'pipe:') - if capture_stdin and video_data is None: + capture_stdin = (input_fn == 'pipe:') + if capture_stdin and input_data is None: raise ValueError( "the buffered video data for stdin should not be empty") - stream = ffmpeg.input(filename) - stream = stream.output( - 'pipe:', - format='wav', - bits_per_raw_sample=bits_per_raw_sample, - ac=1, - ar=16000) + input_kwargs = {} + if start_time is not None: + input_kwargs['ss'] = str(start_time) + else: + start_time = 0. 
+ if end_time is not None: + input_kwargs['t'] = str(end_time - start_time) + + output_kwargs = { + 'format': 'wav', + 'bits_per_raw_sample': bits_per_raw_sample, + 'ac': 1, + 'ar': sample_rate + } + + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + output_options=output_kwargs, + overwrite_output=True) - stdout, _ = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + stdout, _ = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) audio_stream = io.BytesIO(stdout) audio_data, sample_rate = sf.read(audio_stream) @@ -56,8 +74,8 @@ def capture_audio(filename: str = 'pipe:', return audio_data -def get_chunk_times(filename: str = 'pipe:', - video_data: bytes = None, +def get_chunk_times(input_fn: str = 'pipe:', + input_data: bytes = None, silence_threshold: float = DEFAULT_SILENCE_THRESHOLD, silence_duration: float = DEFAULT_SILENCE_DURATION, start_time: float = None, @@ -78,15 +96,20 @@ def get_chunk_times(filename: str = 'pipe:', if end_time is not None: input_kwargs['t'] = end_time - start_time - stream = ffmpeg.input(filename, **input_kwargs) - stream = stream.filter( - 'silencedetect', - n='{}dB'.format(silence_threshold), - d=silence_duration) - stream = stream.output('pipe:', format='null') + au_filters = [ + 'silencedetect=noise={}dB:d={}'.format(silence_threshold, + silence_duration) + ] - stdout, stderr = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + output_kwargs = {'format': 'null'} + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + audio_filters=au_filters, + output_options=output_kwargs) + + stdout, stderr = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) lines = stderr.decode().splitlines() @@ -121,28 +144,41 @@ def get_chunk_times(filename: str = 'pipe:', return list(zip(chunk_starts, chunk_ends)) -def split_audio(filename: str = 'pipe:', - video_data: bytes = None, +def 
split_audio(input_fn: str = 'pipe:', + input_data: bytes = None, silence_threshold=DEFAULT_SILENCE_THRESHOLD, silence_duration=DEFAULT_SILENCE_DURATION, start_time: float = None, end_time: float = None, verbose=False): chunk_times = get_chunk_times( - filename, - video_data=video_data, + input_fn, + input_data=input_data, silence_threshold=silence_threshold, silence_duration=silence_duration, start_time=start_time, end_time=end_time) - audio_chunks = list() for i, (start_time, end_time) in enumerate(chunk_times): time = end_time - start_time - stream = ffmpeg.input(filename, ss=start_time, t=time) - stream = stream.output('pipe:', format='wav') - stdout, _ = stream.run( - input=video_data, capture_stdout=True, capture_stderr=True) + if time < 0: + continue + input_kwargs = { + 'ss': start_time, + 't': time + } + + output_kwargs = { + 'format': 'wav' + } + + cmd_args = compile_args( + input_fn=input_fn, + input_options=input_kwargs, + output_options=output_kwargs) + + stdout, stderr = run_command( + cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True) audio_stream = io.BytesIO(stdout) audio_data, sample_rate = sf.read(audio_stream) diff --git a/gnes/preprocessor/io_utils/ffmpeg.py b/gnes/preprocessor/io_utils/ffmpeg.py new file mode 100644 index 00000000..677a3a27 --- /dev/null +++ b/gnes/preprocessor/io_utils/ffmpeg.py @@ -0,0 +1,140 @@ +# Tencent is pleased to support the open source community by making GNES available. +# +# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# Regexes that scrape stream metadata out of the banner ffmpeg prints on
# stderr.  NOTE(review): the named groups were reconstructed after an
# angle-bracket-stripping mangle; only 'lang' is provably required (it is
# read via groupdict() below) -- confirm the other names against upstream.
VIDEO_DUR_PATTERN = re.compile(r".*Duration: (\d+):(\d+):(\d+)", re.DOTALL)
VIDEO_INFO_PATTERN = re.compile(
    r'.*Stream #0:(\d+)(?:\(\w+\))?: Video: (\w+).*, (yuv\w+)[(,].* (\d+)x(\d+).* (\d+)(\.\d.)? fps',
    re.DOTALL)
AUDIO_INFO_PATTERN = re.compile(
    r'^\s+Stream #0:(?P<idx>\d+)(\((?P<lang>\w+)\))?: Audio: (?P<codec>\w+).*?(?P<default>\(default\))?$',
    re.MULTILINE)
STREAM_SUBTITLE_PATTERN = re.compile(
    r'^\s+Stream #0:(?P<idx>\d+)(\((?P<lang>\w+)\))?: Subtitle:',
    re.MULTILINE)


def parse_media_details(infos):
    """Parse ffmpeg's stderr banner into a media-info dict.

    :param infos: decoded stderr text of an ffmpeg invocation
    :return: dict with keys ``vcodec``, ``frame_width``, ``frame_height``,
        ``duration`` (whole seconds), ``fps``, ``pix_fmt`` and ``audio``
        (list of per-track dicts containing at least ``lang`` and ``codec``)
    :raises ValueError: if no duration or video-stream line is present
    """
    video_dur_match = VIDEO_DUR_PATTERN.match(infos)
    if video_dur_match is None:
        # previously this fell through to an opaque AttributeError on None
        raise ValueError('could not find "Duration:" in ffmpeg output')
    dur_hrs, dur_mins, dur_secs = video_dur_match.group(1, 2, 3)

    video_info_match = VIDEO_INFO_PATTERN.match(infos)
    if video_info_match is None:
        raise ValueError('could not find a video stream in ffmpeg output')
    codec, pix_fmt, res_width, res_height, fps = video_info_match.group(
        2, 3, 4, 5, 6)

    audio_tracks = []
    for audio_match in AUDIO_INFO_PATTERN.finditer(infos):
        ainfo = audio_match.groupdict()
        if ainfo['lang'] is None:
            # ffmpeg omits the language tag on untagged tracks; 'und' is the
            # ISO 639-2 code for "undetermined"
            ainfo['lang'] = 'und'
        audio_tracks.append(ainfo)

    return {
        'vcodec': codec,
        'frame_width': int(res_width),
        'frame_height': int(res_height),
        'duration': int(dur_hrs) * 3600 + int(dur_mins) * 60 + int(dur_secs),
        'fps': int(fps),
        'pix_fmt': pix_fmt,
        'audio': audio_tracks,
    }


def compile_args(input_fn: str = 'pipe:',
                 output_fn: str = 'pipe:',
                 video_filters: list = None,
                 audio_filters: list = None,
                 input_options: dict = None,
                 output_options: dict = None,
                 overwrite_output: bool = True):
    """Assemble an `FFmpeg <https://ffmpeg.org/>`_ command line (argv list).

    ``format``, ``video_bitrate`` and ``audio_bitrate`` keys in the option
    dicts map to their dedicated flags (``-f``, ``-b:v``, ``-b:a``); any
    other key ``k`` is emitted as ``-k value`` (value omitted when ``None``).

    :param input_fn: input file name, or ``'pipe:'`` for stdin
    :param output_fn: output file name, or ``'pipe:'`` for stdout
    :param video_filters: entries joined into a single ``-vf`` chain
    :param audio_filters: entries joined into a single ``-af`` chain
    :param overwrite_output: add ``-y`` so existing files are clobbered
    :return: list of argv tokens, starting with ``'ffmpeg'``
    """
    # Work on copies: the original implementation pop()ed keys straight out
    # of the caller's dicts, mutating them as a side effect; it also used
    # mutable default arguments shared across calls.
    input_options = dict(input_options or {})
    output_options = dict(output_options or {})
    video_filters = list(video_filters or [])
    audio_filters = list(audio_filters or [])

    args = ['ffmpeg']
    if overwrite_output:
        # '-y' is a global option; placing it before the inputs avoids the
        # "trailing option(s) may be ignored" behavior of newer ffmpeg when
        # it was appended after the output file.
        args += ['-y']

    fmt = input_options.pop('format', None)
    if fmt:
        args += ['-f', fmt]
    args += kwargs_to_cmd_args(input_options)
    args += ['-i', input_fn]

    if video_filters:
        args += ['-vf', ','.join(video_filters)]
    if audio_filters:
        args += ['-af', ','.join(audio_filters)]

    fmt = output_options.pop('format', None)
    if fmt:
        args += ['-f', fmt]
    video_bitrate = output_options.pop('video_bitrate', None)
    if video_bitrate:
        args += ['-b:v', str(video_bitrate)]
    audio_bitrate = output_options.pop('audio_bitrate', None)
    if audio_bitrate:
        args += ['-b:a', str(audio_bitrate)]
    args += kwargs_to_cmd_args(output_options)
    args += [output_fn]

    return args


def probe(input_fn: str):
    """Probe basic video properties of *input_fn* with ffprobe.

    :return: dict with ``file``, ``width``, ``height``, ``fps`` (float) and
        ``duration`` (sexagesimal string, because of ``-sexagesimal``)
    :raises Exception: propagated from run_command on ffprobe failure
    """
    command = [
        'ffprobe', '-v', 'fatal', '-show_entries',
        'stream=width,height,r_frame_rate,duration', '-of',
        'default=noprint_wrappers=1:nokey=1', input_fn, '-sexagesimal'
    ]
    out, err = run_command(command, pipe_stdout=True, pipe_stderr=True)

    # nokey=1 output: one bare value per line in the requested entry order
    fields = out.decode().split('\n')
    num, den = fields[2].split('/')
    return {
        'file': input_fn,
        'width': int(fields[0]),
        'height': int(fields[1]),
        'fps': float(num) / float(den),
        'duration': fields[3],
    }


def get_media_meta(input_fn: str = 'pipe:',
                   input_data: bytes = None,
                   input_options: dict = None):
    """Run ffmpeg in metadata-export mode and parse its stderr banner.

    :param input_fn: input file, or ``'pipe:'`` to feed *input_data* on stdin
    :param input_options: extra input options; only ``format`` is consumed
    :return: the dict produced by :func:`parse_media_details`
    """
    # copy so the caller's dict is not mutated by pop()
    input_options = dict(input_options or {})
    cmd_args = ['ffmpeg']

    fmt = input_options.pop('format', None)
    if fmt:
        cmd_args += ['-f', fmt]
    cmd_args += ['-i', input_fn]

    # '-f ffmetadata' makes ffmpeg exit quickly after dumping metadata; the
    # stream details we actually want are printed on stderr.
    cmd_args += ['-f', 'ffmetadata', 'pipe:']
    out, err = run_command(
        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)
    return parse_media_details(err.decode())
a/gnes/preprocessor/io_utils/gif.py +++ b/gnes/preprocessor/io_utils/gif.py @@ -17,8 +17,9 @@ import numpy as np import subprocess as sp import tempfile -import ffmpeg -from .helper import extract_frame_size + +from .ffmpeg import parse_media_details + def decode_gif(data: bytes, fps: int = -1, @@ -35,7 +36,9 @@ def decode_gif(data: bytes, fps: int = -1, out, err = stream.run(capture_stdout=True, capture_stderr=True) - width, height = extract_frame_size(err.decode()) + meta_info = parse_media_details(err.decode()) + width = meta_info['frame_width'] + height = meta_info['frame_height'] depth = 3 if pix_fmt == 'rgba': @@ -43,18 +46,13 @@ def decode_gif(data: bytes, fps: int = -1, frames = np.frombuffer(out, np.uint8).reshape([-1, height, width, depth]) - return list(frames) + return frames def encode_gif( - images: np.ndarray, - scale: str, + images: List[np.ndarray], fps: int, pix_fmt: str = 'rgb24'): - """ - https://superuser.com/questions/556029/how-do-i-convert-a-video-to-gif-using-ffmpeg-with-reasonable-quality - https://gist.github.com/alexlee-gk/38916bf524dc75ca1b988d113aa30710 - """ cmd = [ 'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-r', diff --git a/gnes/preprocessor/io_utils/helper.py b/gnes/preprocessor/io_utils/helper.py index 86c15fa4..97b6e614 100644 --- a/gnes/preprocessor/io_utils/helper.py +++ b/gnes/preprocessor/io_utils/helper.py @@ -13,38 +13,60 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re - - -# def ffmpeg_probe_pattern(): -# mediaprobe_re = re.compile( -# r"Duration:\s+(?P(?:(?:\d:?)+[.]?\d*)|N/A)(?:.+start:\s+(?P\d+[.]\d+))?.+bitrate:\s+(?P(?:\d+\s*..[/]s)|N/A)" -# ) -# streamprobe_re = re.compile( -# r"\s*Stream.+:\s+Video:.+\s+(?P\d+x\d+)(?:.*,\s*(?P\d+[.]?\d*)\sfps)?(?:.+\(default\))?" 
def kwargs_to_cmd_args(kwargs):
    """Flatten an options dict into ffmpeg-style argv tokens.

    ``{'ss': 1, 'an': None}`` becomes ``['-ss', '1', '-an']`` -- a ``None``
    value emits the flag alone, anything else is stringified after it.
    """
    args = []
    for key, value in kwargs.items():
        args.append('-%s' % key)
        if value is not None:
            args.append('%s' % str(value))
    return args


def run_command_async(cmd_args,
                      pipe_stdin=True,
                      pipe_stdout=False,
                      pipe_stderr=False,
                      quiet=False):
    """Spawn *cmd_args* as a subprocess and return the ``Popen`` handle.

    ``quiet=True`` captures stdout/stderr even when the corresponding
    ``pipe_*`` flag is off, so the child cannot write to the console.
    """
    stdin_stream = sp.PIPE if pipe_stdin else None
    stdout_stream = sp.PIPE if pipe_stdout or quiet else None
    stderr_stream = sp.PIPE if pipe_stderr or quiet else None

    return sp.Popen(
        cmd_args,
        stdin=stdin_stream,
        stdout=stdout_stream,
        stderr=stderr_stream)


def wait(process):
    """Echo *process*'s stdout line by line until it exits.

    :param process: a ``Popen`` created with ``stdout=sp.PIPE``
    :return: tuple of (last line read, exit code)
    """
    while True:
        output = process.stdout.readline()
        # BUG FIX: stdout is a byte stream, so the original sentinel test
        # ``output == ''`` (a str comparison) could never become true;
        # test for emptiness instead, which works for bytes and str alike.
        if not output and process.poll() is not None:
            break
        if output:
            print(output.strip())
    return (output, process.poll())


def run_command(cmd_args,
                input=None,
                pipe_stdin=True,
                pipe_stdout=False,
                pipe_stderr=False,
                quiet=False):
    """Run *cmd_args* to completion and return ``(stdout, stderr)`` bytes.

    :param input: optional bytes fed to the child's stdin
        (the name shadows the builtin but is part of the public keyword API)
    :raises Exception: if the child exits with a non-zero status; the
        captured stderr is embedded in the message for diagnosis
    """
    with run_command_async(
            cmd_args,
            pipe_stdin=pipe_stdin,
            pipe_stdout=pipe_stdout,
            pipe_stderr=pipe_stderr,
            quiet=quiet) as proc:
        stdout, stderr = proc.communicate(input)
        retcode = proc.poll()

    if retcode:
        raise Exception('ffmpeg error: %s' % stderr)
    return stdout, stderr
a/gnes/preprocessor/io_utils/video.py b/gnes/preprocessor/io_utils/video.py index 21238dea..3fa0f2da 100644 --- a/gnes/preprocessor/io_utils/video.py +++ b/gnes/preprocessor/io_utils/video.py @@ -13,17 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import io import ffmpeg import numpy as np from typing import List -from .helper import extract_frame_size +from .ffmpeg import parse_media_details, compile_args +from .helper import run_command, run_command_async -def scale_video(input_filename: str = 'pipe:', - output_filename: str = 'pipe:', - video_data: bytes = None, +def scale_video(input_fn: str = 'pipe:', + output_fn: str = 'pipe:', + input_data: bytes = None, start_time: float = None, end_time: float = None, scale: str = None, @@ -33,12 +35,12 @@ def scale_video(input_filename: str = 'pipe:', format: str = 'mp4', pix_fmt: str = 'yuv420p', **kwargs): - capture_stdin = (input_filename == 'pipe:') - if capture_stdin and video_data is None: + capture_stdin = (input_fn == 'pipe:') + if capture_stdin and input_data is None: raise ValueError( "the buffered video data for stdin should not be empty") - capture_stdout = (output_filename == 'pipe:') + capture_stdout = (output_fn == 'pipe:') input_kwargs = {} if start_time is not None: @@ -48,8 +50,6 @@ def scale_video(input_filename: str = 'pipe:', if end_time is not None: input_kwargs['t'] = end_time - start_time - stream = ffmpeg.input(input_filename, **input_kwargs) - out_kwargs = { 'vcodec': vcodec, 'pix_fmt': pix_fmt, @@ -65,65 +65,84 @@ def scale_video(input_filename: str = 'pipe:', # an empty moov means it doesn't need to seek and thus works with a pipe. 
def encode_video(images: List['np.ndarray'],
                 pix_fmt: str = 'rgb24',
                 frame_rate: int = 15,
                 output_fn: str = 'pipe:',
                 vcodec: str = 'libx264',
                 format: str = 'mp4',
                 **kwargs):
    """Encode a list of equally-shaped HxWxC frames into a video container.

    All frames are cast to uint8 and fed to ffmpeg as one rawvideo stream.

    :param images: non-empty list of ndarray frames, all the same shape
    :param pix_fmt: pixel format of the raw input frames
    :param output_fn: output file name, or ``'pipe:'`` to return the
        encoded bytes instead of writing a file
    :return: encoded container bytes when ``output_fn == 'pipe:'``,
        otherwise ``None`` (stdout is not captured)
    :raises ValueError: if *images* is empty
    :raises IOError: if ffmpeg exits with a non-zero status
    """
    if not images:
        # previously this crashed with a bare IndexError on images[0]
        raise ValueError('cannot encode an empty list of frames')

    height, width, _ = images[0].shape
    capture_stdout = (output_fn == 'pipe:')

    input_kwargs = {
        'format': 'rawvideo',
        'pix_fmt': pix_fmt,
        'framerate': frame_rate,
        's': '{}x{}'.format(width, height),
    }

    output_kwargs = {
        'vcodec': vcodec,
        'r': frame_rate,
        'pix_fmt': 'yuv420p',
        'format': format,
        # fragmented mp4 needs no seekable output, so it works over a pipe
        'movflags': 'frag_keyframe+empty_moov',
    }

    cmd_args = compile_args(
        input_fn='pipe:',
        output_fn=output_fn,
        input_options=input_kwargs,
        output_options=output_kwargs)

    # concatenate all raw frames once and hand them to communicate(), which
    # writes stdin and drains stdout/stderr without deadlocking
    payload = b''.join(frame.astype(np.uint8).tobytes() for frame in images)

    with run_command_async(
            cmd_args,
            pipe_stdin=True,
            pipe_stdout=capture_stdout,
            pipe_stderr=True) as proc:
        output, err = proc.communicate(payload)

    if proc.returncode:
        # surface both the exact command line and ffmpeg's stderr
        raise IOError('\n'.join([' '.join(cmd_args), err.decode('utf8')]))
    return output
def capture_frames(input_fn: str = 'pipe:',
                   input_data: bytes = None,
                   pix_fmt: str = 'rgb24',
                   fps: int = -1,
                   scale: str = None,
                   start_time: float = None,
                   end_time: float = None,
                   **kwargs) -> 'np.ndarray':
    """Decode a video into a stack of raw frames via ffmpeg.

    :param input_fn: input file, or ``'pipe:'`` to read *input_data* from stdin
    :param fps: resample to this frame rate when positive; <= 0 keeps the
        source rate
    :param scale: optional ``'W:H'`` output size (both must be concrete
        integers -- ffmpeg's keep-aspect ``-1`` would break the reshape below)
    :return: uint8 ndarray of shape (num_frames, height, width, depth)
    :raises ValueError: when reading from stdin without *input_data*
    """
    capture_stdin = (input_fn == 'pipe:')
    if capture_stdin and input_data is None:
        raise ValueError(
            "the buffered video data for stdin should not be empty")

    # NOTE(review): the dict opener sat in a hidden diff-hunk gap; the
    # 'err_detect' entry is reconstructed -- confirm against upstream.
    input_kwargs = {
        'err_detect': 'aggressive',
        'fflags': 'discardcorrupt'  # discard corrupted frames
    }
    if start_time is not None:
        input_kwargs['ss'] = str(start_time)
    else:
        start_time = 0.
    if end_time is not None:
        input_kwargs['t'] = str(end_time - start_time)

    video_filters = []
    # BUG FIX: the default fps of -1 is truthy, so the original ``if fps:``
    # injected a bogus 'fps=-1' filter; only apply it for a positive rate
    # (matching the pre-refactor ``fps > 0`` condition).
    if fps > 0:
        video_filters += ['fps=%d' % fps]
    if scale:
        video_filters += ['scale=%s' % scale]

    output_kwargs = {
        'format': 'image2pipe',
        'pix_fmt': pix_fmt,
        'vcodec': 'rawvideo'
    }

    cmd_args = compile_args(
        input_fn=input_fn,
        input_options=input_kwargs,
        video_filters=video_filters,
        output_options=output_kwargs)

    out, err = run_command(
        cmd_args, input=input_data, pipe_stdout=True, pipe_stderr=True)

    if scale:
        width, height = map(int, scale.split(':'))
    else:
        # no explicit scale: recover the frame geometry from ffmpeg's banner
        meta_info = parse_media_details(err.decode())
        width = meta_info['frame_width']
        height = meta_info['frame_height']

    depth = 4 if pix_fmt == 'rgba' else 3

    return np.frombuffer(out, np.uint8).reshape([-1, height, width, depth])