diff --git a/docs/changelog.rst b/docs/changelog.rst index 74e79dd5..cc8c053d 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,8 +1,20 @@ Changelog --------- +2.2.13 +^^^^^^ +- Mid-row codes add spaces only if there isn't one before. +- Mid-row codes add spaces only if they affect the text in the same row (not adding one if it follows a break or PACs). +- Remove spaces at the end of lines. +- Close italics on receiving another style setting command. +- Throw a CaptionReadNoCaptions error when an empty input file is provided. +- Ignore repositioning commands which are not followed by any text before breaks. +- Mid-row codes will not add the space if it is in front of punctuation. +- Fix a bug with background codes when the InstructionNodeCreator collection is empty. +- Fix a bug with the WebVTT writer adding double line breaks. + 2.2.12 ^^^^^^ -- Pinned nltk version to 3.8.0 +- Pinned nltk to 3.8.0 2.2.11 ^^^^^^ diff --git a/docs/conf.py b/docs/conf.py index 9b455abf..36146434 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '2.2.11' +version = '2.2.12.dev2' # The full version, including alpha/beta/rc tags. -release = '2.2.11' +release = '2.2.12.dev2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/pycaption/__init__.py b/pycaption/__init__.py index be44c55c..adc9b501 100644 --- a/pycaption/__init__.py +++ b/pycaption/__init__.py @@ -34,6 +34,9 @@ def detect_format(caps): :returns: the reader class for the detected format. 
""" + if not len(caps): + raise CaptionReadNoCaptions("Empty caption file") + for reader in SUPPORTED_READERS: if reader().detect(caps): return reader diff --git a/pycaption/base.py b/pycaption/base.py index 8e3da975..1fa77895 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -1,18 +1,19 @@ import os +from collections import defaultdict from datetime import timedelta from numbers import Number from .exceptions import CaptionReadError, CaptionReadTimingError # `und` a special identifier for an undetermined language according to ISO 639-2 -DEFAULT_LANGUAGE_CODE = os.getenv('PYCAPTION_DEFAULT_LANG', 'und') +DEFAULT_LANGUAGE_CODE = os.getenv("PYCAPTION_DEFAULT_LANG", "und") def force_byte_string(content): try: - return content.encode('UTF-8') + return content.encode("UTF-8") except UnicodeEncodeError: - raise RuntimeError('Invalid content encoding') + raise RuntimeError("Invalid content encoding") except UnicodeDecodeError: return content @@ -50,8 +51,9 @@ def read(self, content): class BaseWriter: - def __init__(self, relativize=True, video_width=None, video_height=None, - fit_to_screen=True): + def __init__( + self, relativize=True, video_width=None, video_height=None, fit_to_screen=True + ): """ Initialize writer with the given parameters. @@ -81,7 +83,8 @@ def _relativize_and_fit_to_screen(self, layout_info): if self.relativize: # Transform absolute values (e.g. 
px) into percentages layout_info = layout_info.as_percentage_of( - self.video_width, self.video_height) + self.video_width, self.video_height + ) if self.fit_to_screen: # Make sure origin + extent <= 100% layout_info = layout_info.fit_to_screen() @@ -115,7 +118,7 @@ class CaptionNode: BREAK = 3 def __init__( - self, type_, layout_info=None, content=None, start=None, position=None + self, type_, layout_info=None, content=None, start=None, position=None ): """ :type type_: int @@ -135,30 +138,34 @@ def __repr__(self): if t == CaptionNode.TEXT: return repr(self.content) elif t == CaptionNode.BREAK: - return repr('BREAK') + return repr("BREAK") elif t == CaptionNode.STYLE: - return repr(f'STYLE: {self.start} {self.content}') + return repr(f"STYLE: {self.start} {self.content}") else: - raise RuntimeError(f'Unknown node type: {t}') + raise RuntimeError(f"Unknown node type: {t}") @staticmethod def create_text(text, layout_info=None, position=None): return CaptionNode( - type_=CaptionNode.TEXT, layout_info=layout_info, - position=position, content=text + type_=CaptionNode.TEXT, + layout_info=layout_info, + position=position, + content=text, ) @staticmethod def create_style(start, content, layout_info=None): return CaptionNode( - type_=CaptionNode.STYLE, layout_info=layout_info, content=content, - start=start) + type_=CaptionNode.STYLE, + layout_info=layout_info, + content=content, + start=start, + ) @staticmethod def create_break(layout_info=None, content=None): return CaptionNode( - type_=CaptionNode.BREAK, layout_info=layout_info, - content=content + type_=CaptionNode.BREAK, layout_info=layout_info, content=content ) @@ -184,11 +191,13 @@ def __init__(self, start, end, nodes, style={}, layout_info=None): :type layout_info: Layout """ if not isinstance(start, Number): - raise CaptionReadTimingError("Captions must be initialized with a" - " valid start time") + raise CaptionReadTimingError( + "Captions must be initialized with a" " valid start time" + ) if not 
isinstance(end, Number): - raise CaptionReadTimingError("Captions must be initialized with a" - " valid end time") + raise CaptionReadTimingError( + "Captions must be initialized with a" " valid end time" + ) if not nodes: raise CaptionReadError("Node list cannot be empty") self.start = start @@ -216,9 +225,7 @@ def format_end(self, msec_separator=None): return self._format_timestamp(self.end, msec_separator) def __repr__(self): - return repr( - f'{self.format_start()} --> {self.format_end()}\n{self.get_text()}' - ) + return repr(f"{self.format_start()} --> {self.format_end()}\n{self.get_text()}") def get_text_nodes(self): """ @@ -229,22 +236,24 @@ def get_text_for_node(node): if node.type_ == CaptionNode.TEXT: return node.content if node.type_ == CaptionNode.BREAK: - return '\n' - return '' + return "\n" + return "" return [get_text_for_node(node) for node in self.nodes] def get_text(self): text_nodes = self.get_text_nodes() - return ''.join(text_nodes).strip() + return "".join(text_nodes).strip() def _format_timestamp(self, microseconds, msec_separator=None): duration = timedelta(microseconds=microseconds) hours, rem = divmod(duration.seconds, 3600) minutes, seconds = divmod(rem, 60) milliseconds = f"{duration.microseconds // 1000:03d}" - timestamp = (f"{hours:02d}:{minutes:02d}:{seconds:02d}" - f"{msec_separator or '.'}{milliseconds:.3s}") + timestamp = ( + f"{hours:02d}:{minutes:02d}:{seconds:02d}" + f"{msec_separator or '.'}{milliseconds:.3s}" + ) return timestamp @@ -261,8 +270,7 @@ def __init__(self, iterable=None, layout_info=None): super().__init__(*args) def __getslice__(self, i, j): - return CaptionList( - list.__getslice__(self, i, j), layout_info=self.layout_info) + return CaptionList(list.__getslice__(self, i, j), layout_info=self.layout_info) def __getitem__(self, y): item = list.__getitem__(self, y) @@ -272,20 +280,19 @@ def __getitem__(self, y): def __add__(self, other): add_is_safe = ( - not hasattr(other, 'layout_info') + not hasattr(other, 
"layout_info") or not other.layout_info or self.layout_info == other.layout_info ) if add_is_safe: - return CaptionList( - list.__add__(self, other), layout_info=self.layout_info) + return CaptionList(list.__add__(self, other), layout_info=self.layout_info) else: raise ValueError( - "Cannot add CaptionList objects with different layout_info") + "Cannot add CaptionList objects with different layout_info" + ) def __mul__(self, other): - return CaptionList( - list.__mul__(self, other), layout_info=self.layout_info) + return CaptionList(list.__mul__(self, other), layout_info=self.layout_info) __rmul__ = __mul__ @@ -341,9 +348,7 @@ def set_styles(self, styles): self._styles = styles def is_empty(self): - return all( - [len(captions) == 0 for captions in list(self._captions.values())] - ) + return all([len(captions) == 0 for captions in list(self._captions.values())]) def set_layout_info(self, lang, layout_info): self._captions[lang].layout_info = layout_info @@ -412,6 +417,5 @@ def merge(captions): new_nodes.append(CaptionNode.create_break()) for node in caption.nodes: new_nodes.append(node) - caption = Caption( - captions[0].start, captions[0].end, new_nodes, captions[0].style) + caption = Caption(captions[0].start, captions[0].end, new_nodes, captions[0].style) return caption diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py index ef74b406..398745ed 100644 --- a/pycaption/scc/__init__.py +++ b/pycaption/scc/__init__.py @@ -81,24 +81,38 @@ import math import re import textwrap -from collections import deque, defaultdict +from collections import defaultdict, deque from copy import deepcopy -from pycaption.base import ( - BaseReader, BaseWriter, CaptionSet +from pycaption.base import BaseReader, BaseWriter, CaptionNode, CaptionSet +from pycaption.exceptions import ( + CaptionLineLengthError, + CaptionReadNoCaptions, + CaptionReadTimingError, + InvalidInputError, ) -from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError, \ - 
CaptionReadTimingError, CaptionLineLengthError + from .constants import ( - HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS, - MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE, - SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP, - PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED, - PAC_TAB_OFFSET_COMMANDS, CUE_STARTING_COMMAND + CHARACTER_TO_CODE, + CHARACTERS, + COMMANDS, + CUE_STARTING_COMMAND, + EXTENDED_CHARS, + HEADER, + MICROSECONDS_PER_CODEWORD, + PAC_BYTES_TO_POSITIONING_MAP, + PAC_HIGH_BYTE_BY_ROW, + PAC_LOW_BYTE_BY_ROW_RESTRICTED, + PAC_TAB_OFFSET_COMMANDS, + SPECIAL_CHARS, + SPECIAL_OR_EXTENDED_CHAR_TO_CODE, ) -from .specialized_collections import ( # noqa: F401 - TimingCorrectingCaptionList, NotifyingDict, CaptionCreator, - InstructionNodeCreator, PopOnCue, +from .specialized_collections import CaptionCreator # noqa: F401 +from .specialized_collections import ( + InstructionNodeCreator, + NotifyingDict, + PopOnCue, + TimingCorrectingCaptionList, ) from .state_machines import DefaultProvidingPositionTracker @@ -112,8 +126,8 @@ class NodeCreatorFactory: this information must be erased after the reader's .read() operation completes. 
""" - def __init__(self, position_tracker, - node_creator=InstructionNodeCreator): + + def __init__(self, position_tracker, node_creator=InstructionNodeCreator): self.position_tracker = position_tracker self.node_creator = node_creator @@ -131,8 +145,7 @@ def from_list(self, roll_rows): :return: a node_creator instance """ return self.node_creator.from_list( - roll_rows, - position_tracker=self.position_tracker + roll_rows, position_tracker=self.position_tracker ) @@ -155,6 +168,7 @@ class SCCReader(BaseReader): This can be then later used for converting into any other supported formats """ + def __init__(self, *args, **kw): self.caption_stash = CaptionCreator() self.time_translator = _SccTimeTranslator() @@ -163,18 +177,18 @@ def __init__(self, *args, **kw): DefaultProvidingPositionTracker() ) - self.last_command = '' + self.last_command = "" self.double_starter = False self.buffer_dict = NotifyingDict() - self.buffer_dict['pop'] = self.node_creator_factory.new_creator() - self.buffer_dict['paint'] = self.node_creator_factory.new_creator() - self.buffer_dict['roll'] = self.node_creator_factory.new_creator() + self.buffer_dict["pop"] = self.node_creator_factory.new_creator() + self.buffer_dict["paint"] = self.node_creator_factory.new_creator() + self.buffer_dict["roll"] = self.node_creator_factory.new_creator() # Call this method when the active key changes self.buffer_dict.add_change_observer(self._flush_implicit_buffers) - self.buffer_dict.set_active('pop') + self.buffer_dict.set_active("pop") self.pop_ons_queue = deque() @@ -197,7 +211,7 @@ def detect(self, content): else: return False - def read(self, content, lang='en-US', simulate_roll_up=False, offset=0): + def read(self, content, lang="en-US", simulate_roll_up=False, offset=0): """Converts the unicode string into a CaptionSet :type content: str @@ -217,14 +231,13 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0): :rtype: CaptionSet """ if not isinstance(content, str): - raise 
InvalidInputError('The content is not a unicode string.') + raise InvalidInputError("The content is not a unicode string.") self.simulate_roll_up = simulate_roll_up self.time_translator.offset = offset * 1000000 # split lines lines = content.splitlines() - # loop through each line except the first for line in lines[1:]: self._translate_line(line) @@ -238,7 +251,9 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0): for caption in self.caption_stash._collection: caption_start = caption.to_real_caption().format_start() caption_text = "".join(caption.to_real_caption().get_text_nodes()) - text_too_long = [line for line in caption_text.split("\n") if len(line) > 32] + text_too_long = [ + line for line in caption_text.split("\n") if len(line) > 32 + ] if caption_start in lines_too_long: lines_too_long[caption_start] = text_too_long else: @@ -264,9 +279,10 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0): # EOC marker in the SCC file) if 0 < cap.end - cap.start < 50000: raise CaptionReadTimingError( - f'Unsupported cue duration around {cap.format_start()} ' + f"Unsupported cue duration around {cap.format_start()} " f'for line beginning with "{cap.get_text()}". Duration ' - f'must be at least 0.05 seconds.') + f"must be at least 0.05 seconds." + ) if captions.is_empty(): raise CaptionReadNoCaptions("empty caption file") @@ -286,22 +302,22 @@ def _flush_implicit_buffers(self, old_key=None, *args): If they're on the last row however, or if the caption type is changing, we make sure to convert the buffers to text, so we don't lose any info. 
""" - if old_key == 'pop': + if old_key == "pop": if self.pop_ons_queue: self._pop_on() - elif old_key == 'roll': + elif old_key == "roll": if not self.buffer.is_empty(): self._roll_up() - elif old_key == 'paint': + elif old_key == "paint": if not self.buffer.is_empty(): self.caption_stash.create_and_store(self.buffer, self.time) self.buffer = self.node_creator_factory.new_creator() def _translate_line(self, line): # ignore blank lines - if line.strip() == '': + if line.strip() == "": return # split line in timestamp and words @@ -309,20 +325,15 @@ def _translate_line(self, line): parts = r.findall(line.lower()) self.time_translator.start_at(parts[0][0]) - word_list = parts[0][2].split(' ') + word_list = parts[0][2].split(" ") for idx, word in enumerate(word_list): word = word.strip() - previous_is_pac_or_tab = len(word_list) > 1 and ( - _is_pac_command(word_list[idx - 1]) or word_list[idx - 1] in PAC_TAB_OFFSET_COMMANDS - ) if len(word) == 4: - self._translate_word( - word=word, - previous_is_pac_or_tab=previous_is_pac_or_tab, - ) + next_command = word_list[idx + 1] if idx + 1 < len(word_list) else None + self._translate_word(word=word, next_command=next_command) - def _translate_word(self, word, previous_is_pac_or_tab): + def _translate_word(self, word, next_command=None): if self._handle_double_command(word): # count frames for timing self.time_translator.increment_frames() @@ -331,7 +342,7 @@ def _translate_word(self, word, previous_is_pac_or_tab): # TODO - check that all the positioning commands are here, or use # some other strategy to determine if the word is a command. 
if word in COMMANDS or _is_pac_command(word): - self._translate_command(word=word, previous_is_pac_or_tab=previous_is_pac_or_tab) + self._translate_command(word=word, next_command=next_command) # second, check if word is a special character elif word in SPECIAL_CHARS: @@ -358,7 +369,12 @@ def _handle_double_command(self, word): doubled_types = word != "94a1" and word in COMMANDS or _is_pac_command(word) if self.double_starter: - doubled_types = doubled_types or word in EXTENDED_CHARS or word == "94a1" or word in SPECIAL_CHARS + doubled_types = ( + doubled_types + or word in EXTENDED_CHARS + or word == "94a1" + or word in SPECIAL_CHARS + ) if word in CUE_STARTING_COMMAND and word != self.last_command: self.double_starter = False @@ -366,12 +382,12 @@ def _handle_double_command(self, word): if doubled_types and word == self.last_command: if word in CUE_STARTING_COMMAND: self.double_starter = True - self.last_command = '' + self.last_command = "" return True # Fix for the # repetition elif _is_pac_command(word) and word in self.last_command: - self.last_command = '' + self.last_command = "" return True elif word in PAC_TAB_OFFSET_COMMANDS: if _is_pac_command(self.last_command): @@ -398,40 +414,37 @@ def _translate_extended_char(self, word): # add to buffer self.buffer.add_chars(EXTENDED_CHARS[word]) - def _translate_command(self, word, previous_is_pac_or_tab): + def _translate_command(self, word, next_command=None): # if command is pop_up - if word == '9420': - self.buffer_dict.set_active('pop') + if word == "9420": + self.buffer_dict.set_active("pop") # command is paint_on [Resume Direct Captioning] - elif word == '9429': - self.buffer_dict.set_active('paint') + elif word == "9429": + self.buffer_dict.set_active("paint") self.roll_rows_expected = 1 if not self.buffer.is_empty(): - self.caption_stash.create_and_store( - self.buffer, self.time - ) + self.caption_stash.create_and_store(self.buffer, self.time) self.buffer = self.node_creator_factory.new_creator() 
self.time = self.time_translator.get_time() # if command is roll_up 2, 3 or 4 rows - elif word in ('9425', '9426', '94a7'): - self.buffer_dict.set_active('roll') + elif word in ("9425", "9426", "94a7"): + self.buffer_dict.set_active("roll") # count how many lines are expected - if word == '9425': + if word == "9425": self.roll_rows_expected = 2 - elif word == '9426': + elif word == "9426": self.roll_rows_expected = 3 - elif word == '94a7': + elif word == "94a7": self.roll_rows_expected = 4 # if content is in the queue, turn it into a caption if not self.buffer.is_empty(): - self.caption_stash.create_and_store( - self.buffer, self.time) + self.caption_stash.create_and_store(self.buffer, self.time) self.buffer = self.node_creator_factory.new_creator() # set rows to empty, configure start time for caption @@ -439,11 +452,11 @@ def _translate_command(self, word, previous_is_pac_or_tab): self.time = self.time_translator.get_time() # clear pop_on buffer - elif word == '94ae': + elif word == "94ae": self.buffer = self.node_creator_factory.new_creator() # display pop_on buffer [End Of Caption] - elif word == '942f': + elif word == "942f": self.time = self.time_translator.get_time() if self.pop_ons_queue: # there's a pop-on cue not ended by the 942c command @@ -455,22 +468,19 @@ def _translate_command(self, word, previous_is_pac_or_tab): self.buffer = self.node_creator_factory.new_creator() # roll up captions [Carriage Return] - elif word == '94ad': + elif word == "94ad": # display roll-up buffer if not self.buffer.is_empty(): self._roll_up() # 942c - Erase Displayed Memory - Clear the current screen of any # displayed captions or text. 
- elif word == '942c' and self.pop_ons_queue: + elif word == "942c" and self.pop_ons_queue: self._pop_on(end=self.time_translator.get_time()) # If command is not one of the aforementioned, add it to buffer else: - self.buffer.interpret_command( - command=word, - previous_is_pac_or_tab=previous_is_pac_or_tab - ) + self.buffer.interpret_command(command=word, next_command=next_command) def _translate_characters(self, word): # split word into the 2 bytes @@ -508,8 +518,7 @@ def _roll_up(self): self.roll_rows.pop(0) self.roll_rows.append(self.buffer) - self.buffer = self.node_creator_factory.from_list( - self.roll_rows) + self.buffer = self.node_creator_factory.from_list(self.roll_rows) # convert buffer and empty self.caption_stash.create_and_store(self.buffer, self.time) @@ -523,8 +532,7 @@ def _roll_up(self): def _pop_on(self, end=0): pop_on_cue = self.pop_ons_queue.pop() - self.caption_stash.create_and_store( - pop_on_cue.buffer, pop_on_cue.start, end) + self.caption_stash.create_and_store(pop_on_cue.buffer, pop_on_cue.start, end) class SCCWriter(BaseWriter): @@ -532,7 +540,7 @@ def __init__(self, *args, **kw): super().__init__(*args, **kw) def write(self, caption_set): - output = HEADER + '\n\n' + output = HEADER + "\n\n" if caption_set.is_empty(): return output @@ -544,8 +552,10 @@ def write(self, caption_set): captions = caption_set.get_captions(lang) # PASS 1: compute codes for each caption - codes = [(self._text_to_code(caption), caption.start, caption.end) - for caption in captions] + codes = [ + (self._text_to_code(caption), caption.start, caption.end) + for caption in captions + ] # PASS 2: # Advance start times so as to have time to write to the pop-on @@ -563,13 +573,13 @@ def write(self, caption_set): # PASS 3: # Write captions. 
- for (code, start, end) in codes: - output += f'{self._format_timestamp(start)}\t' - output += '94ae 94ae 9420 9420 ' + for code, start, end in codes: + output += f"{self._format_timestamp(start)}\t" + output += "94ae 94ae 9420 9420 " output += code - output += '942c 942c 942f 942f\n\n' + output += "942c 942c 942f 942f\n\n" if end is not None: - output += f'{self._format_timestamp(end)}\t942c 942c\n\n' + output += f"{self._format_timestamp(end)}\t942c 942c\n\n" return output @@ -577,21 +587,21 @@ def write(self, caption_set): @staticmethod def _layout_line(caption): caption_text = "".join(caption.get_text_nodes()) - inner_lines = caption_text.split('\n') + inner_lines = caption_text.split("\n") inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines] - return '\n'.join(inner_lines_laid_out) + return "\n".join(inner_lines_laid_out) @staticmethod def _maybe_align(code): # Finish a half-word with a no-op so we can move to a full word if len(code) % 5 == 2: - code += '80 ' + code += "80 " return code @staticmethod def _maybe_space(code): if len(code) % 5 == 4: - code += ' ' + code += " " return code def _print_character(self, code, char): @@ -601,7 +611,7 @@ def _print_character(self, code, char): try: char_code = SPECIAL_OR_EXTENDED_CHAR_TO_CODE[char] except KeyError: - char_code = '91b6' # Use £ as "unknown character" symbol + char_code = "91b6" # Use £ as "unknown character" symbol if len(char_code) == 2: return code + char_code @@ -612,14 +622,16 @@ def _print_character(self, code, char): return code def _text_to_code(self, s): - code = '' - lines = self._layout_line(s).split('\n') + code = "" + lines = self._layout_line(s).split("\n") for row, line in enumerate(lines): row += 16 - len(lines) # Move cursor to column 0 of the destination row for _ in range(2): - code += (PAC_HIGH_BYTE_BY_ROW[row] - + f'{PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]} ') + code += ( + PAC_HIGH_BYTE_BY_ROW[row] + + f"{PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]} " + ) # Print the line using 
the SCC encoding for char in line: code = self._print_character(code, char) @@ -639,14 +651,14 @@ def _format_timestamp(microseconds): seconds = math.floor(seconds_float) seconds_float -= seconds frames = math.floor(seconds_float * 30) - return f'{hours:02}:{minutes:02}:{seconds:02}:{frames:02}' + return f"{hours:02}:{minutes:02}:{seconds:02}:{frames:02}" class _SccTimeTranslator: """Converts SCC time to microseconds, keeping track of frames passed""" def __init__(self): - self._time = '00:00:00;00' + self._time = "00:00:00;00" # microseconds. The offset from which we begin the time calculation self.offset = 0 @@ -659,8 +671,7 @@ def get_time(self): :rtype: int """ return self._translate_time( - self._time[:-2] + str(int(self._time[-2:]) + self._frames), - self.offset + self._time[:-2] + str(int(self._time[-2:]) + self._frames), self.offset ) @staticmethod @@ -672,13 +683,14 @@ def _translate_time(stamp, offset): Helpful for when the captions are off by some time interval. :rtype: int """ - if not re.match(r'\d{2}:\d{2}:\d{2}[:;]\d{1,2}', stamp): + if not re.match(r"\d{2}:\d{2}:\d{2}[:;]\d{1,2}", stamp): raise CaptionReadTimingError( "Timestamps should follow the hour:minute:seconds" ";frames or hour:minute:seconds:frames format. Please correct " - f"the following time: {stamp}.") + f"the following time: {stamp}." 
+ ) - if ';' in stamp: + if ";" in stamp: # Drop-frame timebase runs at the same rate as wall clock seconds_per_timestamp_second = 1.0 else: @@ -686,12 +698,14 @@ def _translate_time(stamp, offset): # 1 second of timecode is longer than an actual second (1.001s) seconds_per_timestamp_second = 1001.0 / 1000.0 - time_split = stamp.replace(';', ':').split(':') + time_split = stamp.replace(";", ":").split(":") - timestamp_seconds = (int(time_split[0]) * 3600 - + int(time_split[1]) * 60 - + int(time_split[2]) - + int(time_split[3]) / 30.0) + timestamp_seconds = ( + int(time_split[0]) * 3600 + + int(time_split[1]) * 60 + + int(time_split[2]) + + int(time_split[3]) / 30.0 + ) seconds = timestamp_seconds * seconds_per_timestamp_second microseconds = seconds * 1000 * 1000 - offset diff --git a/pycaption/scc/constants.py b/pycaption/scc/constants.py index bc2fcd50..c8d5cefb 100644 --- a/pycaption/scc/constants.py +++ b/pycaption/scc/constants.py @@ -521,22 +521,22 @@ '97a8': '', '9729': '', '972a': '', - '9120': '<$>{end-italic}<$>', - '91a1': '', - '91a2': '', - '9123': '', - '91a4': '', - '9125': '', - '9126': '', - '91a7': '', - '91a8': '', - '9129': '', - '912a': '', - '91ab': '', - '912c': '', - '91ad': '', - '97ae': '', - '972f': '', + '9120': '<$>{end-italic}<$>', # plain white + '91a1': '<$>{end-italic}<$>', # white underlined + '91a2': '<$>{end-italic}<$>', # plain green + '9123': '<$>{end-italic}<$>', # green underlined + '91a4': '<$>{end-italic}<$>', # plain blue + '9125': '<$>{end-italic}<$>', # blue underlined + '9126': '<$>{end-italic}<$>', # plain cyan + '91a7': '<$>{end-italic}<$>', # cyan underlined + '91a8': '<$>{end-italic}<$>', # plain red + '9129': '<$>{end-italic}<$>', # red underlined + '912a': '<$>{end-italic}<$>', # plain yellow + '91ab': '<$>{end-italic}<$>', # yellow underlined + '912c': '<$>{end-italic}<$>', # plain magenta + '91ad': '<$>{end-italic}<$>', # magenta underlined + '97ae': '<$>{end-italic}<$>', # plain black + '972f': 
'<$>{end-italic}<$>', # black underlined '91ae': '<$>{italic}<$>', '912f': '<$>{italic}<$>', '94a8': '', @@ -1060,3 +1060,569 @@ def _restructure_bytes_to_position_map(byte_to_pos_map): ] CUE_STARTING_COMMAND = ['9425', '9426', '94a7', '9429', '9420'] + +ALL_CHARACTERS = {**CHARACTERS, **SPECIAL_CHARS, **EXTENDED_CHARS} + +COMMAND_LABELS = { + "9420": "Resume Caption Loading", + "9429": "Resume Direct Captioning", + "9425": "Roll-Up Captions--2 Rows", + "9426": "Roll-Up Captions--3 Rows", + "94a7": "Roll-Up Captions--4 Rows", + "942a": "Text Restart", + "94ab": "Resume Text Display", + "942c": "Erase Displayed Memory", + "94ae": "Erase Non-displayed Memory", + "942f": "End Of Caption", + "9140": "row 01, column 00, with plain white text.", + "91c1": "row 01, column 00, with white underlined text.", + "91c2": "row 01, column 00, with plain green text.", + "9143": "row 01, column 00, with green underlined text.", + "91c4": "row 01, column 00, with plain blue text.", + "9145": "row 01, column 00, with blue underlined text.", + "9146": "row 01, column 00, with plain cyan text.", + "91c7": "row 01, column 00, with cyan underlined text.", + "91c8": "row 01, column 00, with plain red text.", + "9149": "row 01, column 00, with red underlined text.", + "914a": "row 01, column 00, with plain yellow text.", + "91cb": "row 01, column 00, with yellow underlined text.", + "914c": "row 01, column 00, with plain magenta text.", + "91cd": "row 01, column 00, with magenta underlined text.", + "91ce": "row 01, column 00, with white italicized text.", + "914f": "row 01, column 00, with white underlined italicized text.", + "91d0": "row 01, column 00, with plain white text.", + "9151": "row 01, column 00, with white underlined text.", + "9152": "row 01, column 04, with plain white text.", + "91d3": "row 01, column 04, with white underlined text.", + "9154": "row 01, column 08, with plain white text.", + "91d5": "row 01, column 08, with white underlined text.", + "91d6": "row 01, column 
12, with plain white text.", + "9157": "row 01, column 12, with white underlined text.", + "9158": "row 01, column 16, with plain white text.", + "91d9": "row 01, column 16, with white underlined text.", + "91da": "row 01, column 20, with plain white text.", + "915b": "row 01, column 20, with white underlined text.", + "91dc": "row 01, column 24, with plain white text.", + "915d": "row 01, column 24, with white underlined text.", + "915e": "row 01, column 28, with plain white text.", + "91df": "row 01, column 28, with white underlined text.", + "91e0": "row 02, column 00, with plain white text.", + "9161": "row 02, column 00, with white underlined text.", + "9162": "row 02, column 00, with plain green text.", + "91e3": "row 02, column 00, with green underlined text.", + "9164": "row 02, column 00, with plain blue text.", + "91e5": "row 02, column 00, with blue underlined text.", + "91e6": "row 02, column 00, with plain cyan text.", + "9167": "row 02, column 00, with cyan underlined text.", + "9168": "row 02, column 00, with plain red text.", + "91e9": "row 02, column 00, with red underlined text.", + "91ea": "row 02, column 00, with plain yellow text.", + "916b": "row 02, column 00, with yellow underlined text.", + "91ec": "row 02, column 00, with plain magenta text.", + "916d": "row 02, column 00, with magenta underlined text.", + "916e": "row 02, column 00, with white italicized text.", + "91ef": "row 02, column 00, with white underlined italicized text.", + "9170": "row 02, column 00, with plain white text.", + "91f1": "row 02, column 00, with white underlined text.", + "91f2": "row 02, column 04, with plain white text.", + "9173": "row 02, column 04, with white underlined text.", + "91f4": "row 02, column 08, with plain white text.", + "9175": "row 02, column 08, with white underlined text.", + "9176": "row 02, column 12, with plain white text.", + "91f7": "row 02, column 12, with white underlined text.", + "91f8": "row 02, column 16, with plain white text.", + 
"9179": "row 02, column 16, with white underlined text.", + "917a": "row 02, column 20, with plain white text.", + "91fb": "row 02, column 20, with white underlined text.", + "91fc": "row 02, column 24, with plain white text.", + "91fd": "row 02, column 24, with white underlined text.", + "91fe": "row 02, column 28, with plain white text.", + "917f": "row 02, column 28, with white underlined text.", + "9240": "row 03, column 00, with plain white text.", + "92c1": "row 03, column 00, with white underlined text.", + "92c2": "row 03, column 00, with plain green text.", + "9243": "row 03, column 00, with green underlined text.", + "92c4": "row 03, column 00, with plain blue text.", + "9245": "row 03, column 00, with blue underlined text.", + "9246": "row 03, column 00, with plain cyan text.", + "92c7": "row 03, column 00, with cyan underlined text.", + "92c8": "row 03, column 00, with plain red text.", + "9249": "row 03, column 00, with red underlined text.", + "924a": "row 03, column 00, with plain yellow text.", + "92cb": "row 03, column 00, with yellow underlined text.", + "924c": "row 03, column 00, with plain magenta text.", + "92cd": "row 03, column 00, with magenta underlined text.", + "92ce": "row 03, column 00, with white italicized text.", + "924f": "row 03, column 00, with white underlined italicized text.", + "92d0": "row 03, column 00, with plain white text.", + "9251": "row 03, column 00, with white underlined text.", + "9252": "row 03, column 04, with plain white text.", + "92d3": "row 03, column 04, with white underlined text.", + "9254": "row 03, column 08, with plain white text.", + "92d5": "row 03, column 08, with white underlined text.", + "92d6": "row 03, column 12, with plain white text.", + "9257": "row 03, column 12, with white underlined text.", + "9258": "row 03, column 16, with plain white text.", + "92d9": "row 03, column 16, with white underlined text.", + "92da": "row 03, column 20, with plain white text.", + "925b": "row 03, column 20, 
with white underlined text.", + "92dc": "row 03, column 24, with plain white text.", + "925d": "row 03, column 24, with white underlined text.", + "925e": "row 03, column 28, with plain white text.", + "92df": "row 03, column 28, with white underlined text.", + "92e0": "row 04, column 00, with plain white text.", + "9261": "row 04, column 00, with white underlined text.", + "9262": "row 04, column 00, with plain green text.", + "92e3": "row 04, column 00, with green underlined text.", + "9264": "row 04, column 00, with plain blue text.", + "92e5": "row 04, column 00, with blue underlined text.", + "92e6": "row 04, column 00, with plain cyan text.", + "9267": "row 04, column 00, with cyan underlined text.", + "9268": "row 04, column 00, with plain red text.", + "92e9": "row 04, column 00, with red underlined text.", + "92ea": "row 04, column 00, with plain yellow text.", + "926b": "row 04, column 00, with yellow underlined text.", + "92ec": "row 04, column 00, with plain magenta text.", + "926d": "row 04, column 00, with magenta underlined text.", + "926e": "row 04, column 00, with white italicized text.", + "92ef": "row 04, column 00, with white underlined italicized text.", + "9270": "row 04, column 00, with plain white text.", + "92f1": "row 04, column 00, with white underlined text.", + "92f2": "row 04, column 04, with plain white text.", + "9273": "row 04, column 04, with white underlined text.", + "92f4": "row 04, column 08, with plain white text.", + "9275": "row 04, column 08, with white underlined text.", + "9276": "row 04, column 12, with plain white text.", + "92f7": "row 04, column 12, with white underlined text.", + "92f8": "row 04, column 16, with plain white text.", + "9279": "row 04, column 16, with white underlined text.", + "927a": "row 04, column 20, with plain white text.", + "92fb": "row 04, column 20, with white underlined text.", + "92fc": "row 04, column 24, with plain white text.", + "92fd": "row 04, column 24, with white underlined text.", 
+ "92fe": "row 04, column 28, with plain white text.", + "927f": "row 04, column 28, with white underlined text.", + "1540": "row 05, column 00, with plain white text.", + "15c1": "row 05, column 00, with white underlined text.", + "15c2": "row 05, column 00, with plain green text.", + "1543": "row 05, column 00, with green underlined text.", + "15c4": "row 05, column 00, with plain blue text.", + "1545": "row 05, column 00, with blue underlined text.", + "1546": "row 05, column 00, with plain cyan text.", + "15c7": "row 05, column 00, with cyan underlined text.", + "15c8": "row 05, column 00, with plain red text.", + "1549": "row 05, column 00, with red underlined text.", + "154a": "row 05, column 00, with plain yellow text.", + "15cb": "row 05, column 00, with yellow underlined text.", + "154c": "row 05, column 00, with plain magenta text.", + "15cd": "row 05, column 00, with magenta underlined text.", + "15ce": "row 05, column 00, with white italicized text.", + "154f": "row 05, column 00, with white underlined italicized text.", + "15d0": "row 05, column 00, with plain white text.", + "1551": "row 05, column 00, with white underlined text.", + "1552": "row 05, column 04, with plain white text.", + "15d3": "row 05, column 04, with white underlined text.", + "1554": "row 05, column 08, with plain white text.", + "15d5": "row 05, column 08, with white underlined text.", + "15d6": "row 05, column 12, with plain white text.", + "1557": "row 05, column 12, with white underlined text.", + "1558": "row 05, column 16, with plain white text.", + "15d9": "row 05, column 16, with white underlined text.", + "15da": "row 05, column 20, with plain white text.", + "155b": "row 05, column 20, with white underlined text.", + "15dc": "row 05, column 24, with plain white text.", + "155d": "row 05, column 24, with white underlined text.", + "155e": "row 05, column 28, with plain white text.", + "15df": "row 05, column 28, with white underlined text.", + "15e0": "row 06, column 00, 
with plain white text.", + "1561": "row 06, column 00, with white underlined text.", + "1562": "row 06, column 00, with plain green text.", + "15e3": "row 06, column 00, with green underlined text.", + "1564": "row 06, column 00, with plain blue text.", + "15e5": "row 06, column 00, with blue underlined text.", + "15e6": "row 06, column 00, with plain cyan text.", + "1567": "row 06, column 00, with cyan underlined text.", + "1568": "row 06, column 00, with plain red text.", + "15e9": "row 06, column 00, with red underlined text.", + "15ea": "row 06, column 00, with plain yellow text.", + "156b": "row 06, column 00, with yellow underlined text.", + "15ec": "row 06, column 00, with plain magenta text.", + "156d": "row 06, column 00, with magenta underlined text.", + "156e": "row 06, column 00, with white italicized text.", + "15ef": "row 06, column 00, with white underlined italicized text.", + "1570": "row 06, column 00, with plain white text.", + "15f1": "row 06, column 00, with white underlined text.", + "15f2": "row 06, column 04, with plain white text.", + "1573": "row 06, column 04, with white underlined text.", + "15f4": "row 06, column 08, with plain white text.", + "1575": "row 06, column 08, with white underlined text.", + "1576": "row 06, column 12, with plain white text.", + "15f7": "row 06, column 12, with white underlined text.", + "15f8": "row 06, column 16, with plain white text.", + "1579": "row 06, column 16, with white underlined text.", + "157a": "row 06, column 20, with plain white text.", + "15fb": "row 06, column 20, with white underlined text.", + "15fc": "row 06, column 24, with plain white text.", + "15fd": "row 06, column 24, with white underlined text.", + "15fe": "row 06, column 28, with plain white text.", + "157f": "row 06, column 28, with white underlined text.", + "1640": "row 07, column 00, with plain white text.", + "16c1": "row 07, column 00, with white underlined text.", + "16c2": "row 07, column 00, with plain green text.", +
"1643": "row 07, column 00, with green underlined text.", + "16c4": "row 07, column 00, with plain blue text.", + "1645": "row 07, column 00, with blue underlined text.", + "1646": "row 07, column 00, with plain cyan text.", + "16c7": "row 07, column 00, with cyan underlined text.", + "16c8": "row 07, column 00, with plain red text.", + "1649": "row 07, column 00, with red underlined text.", + "164a": "row 07, column 00, with plain yellow text.", + "16cb": "row 07, column 00, with yellow underlined text.", + "164c": "row 07, column 00, with plain magenta text.", + "16cd": "row 07, column 00, with magenta underlined text.", + "16ce": "row 07, column 00, with white italicized text.", + "164f": "row 07, column 00, with white underlined italicized text.", + "16d0": "row 07, column 00, with plain white text.", + "1651": "row 07, column 00, with white underlined text.", + "1652": "row 07, column 04, with plain white text.", + "16d3": "row 07, column 04, with white underlined text.", + "1654": "row 07, column 08, with plain white text.", + "16d5": "row 07, column 08, with white underlined text.", + "16d6": "row 07, column 12, with plain white text.", + "1657": "row 07, column 12, with white underlined text.", + "1658": "row 07, column 16, with plain white text.", + "16d9": "row 07, column 16, with white underlined text.", + "16da": "row 07, column 20, with plain white text.", + "165b": "row 07, column 20, with white underlined text.", + "16dc": "row 07, column 24, with plain white text.", + "165d": "row 07, column 24, with white underlined text.", + "165e": "row 07, column 28, with plain white text.", + "16df": "row 07, column 28, with white underlined text.", + "16e0": "row 08, column 00, with plain white text.", + "1661": "row 08, column 00, with white underlined text.", + "1662": "row 08, column 00, with plain green text.", + "16e3": "row 08, column 00, with green underlined text.", + "1664": "row 08, column 00, with plain blue text.", + "16e5": "row 08, column 00,
with blue underlined text.", + "16e6": "row 08, column 00, with plain cyan text.", + "1667": "row 08, column 00, with cyan underlined text.", + "1668": "row 08, column 00, with plain red text.", + "16e9": "row 08, column 00, with red underlined text.", + "16ea": "row 08, column 00, with plain yellow text.", + "166b": "row 08, column 00, with yellow underlined text.", + "16ec": "row 08, column 00, with plain magenta text.", + "166d": "row 08, column 00, with magenta underlined text.", + "166e": "row 08, column 00, with white italicized text.", + "16ef": "row 08, column 00, with white underlined italicized text.", + "1670": "row 08, column 00, with plain white text.", + "16f1": "row 08, column 00, with white underlined text.", + "16f2": "row 08, column 04, with plain white text.", + "1673": "row 08, column 04, with white underlined text.", + "16f4": "row 08, column 08, with plain white text.", + "1675": "row 08, column 08, with white underlined text.", + "1676": "row 08, column 12, with plain white text.", + "16f7": "row 08, column 12, with white underlined text.", + "16f8": "row 08, column 16, with plain white text.", + "1679": "row 08, column 16, with white underlined text.", + "167a": "row 08, column 20, with plain white text.", + "16fb": "row 08, column 20, with white underlined text.", + "16fc": "row 08, column 24, with plain white text.", + "16fd": "row 08, column 24, with white underlined text.", + "16fe": "row 08, column 28, with plain white text.", + "167f": "row 08, column 28, with white underlined text.", + "9740": "row 09, column 00, with plain white text.", + "97c1": "row 09, column 00, with white underlined text.", + "97c2": "row 09, column 00, with plain green text.", + "9743": "row 09, column 00, with green underlined text.", + "97c4": "row 09, column 00, with plain blue text.", + "9745": "row 09, column 00, with blue underlined text.", + "9746": "row 09, column 00, with plain cyan text.", + "97c7": "row 09, column 00, with cyan underlined text.", + 
"97c8": "row 09, column 00, with plain red text.", + "9749": "row 09, column 00, with red underlined text.", + "974a": "row 09, column 00, with plain yellow text.", + "97cb": "row 09, column 00, with yellow underlined text.", + "974c": "row 09, column 00, with plain magenta text.", + "97cd": "row 09, column 00, with magenta underlined text.", + "97ce": "row 09, column 00, with white italicized text.", + "974f": "row 09, column 00, with white underlined italicized text.", + "97d0": "row 09, column 00, with plain white text.", + "9751": "row 09, column 00, with white underlined text.", + "9752": "row 09, column 04, with plain white text.", + "97d3": "row 09, column 04, with white underlined text.", + "9754": "row 09, column 08, with plain white text.", + "97d5": "row 09, column 08, with white underlined text.", + "97d6": "row 09, column 12, with plain white text.", + "9757": "row 09, column 12, with white underlined text.", + "9758": "row 09, column 16, with plain white text.", + "97d9": "row 09, column 16, with white underlined text.", + "97da": "row 09, column 20, with plain white text.", + "975b": "row 09, column 20, with white underlined text.", + "97dc": "row 09, column 24, with plain white text.", + "975d": "row 09, column 24, with white underlined text.", + "975e": "row 09, column 28, with plain white text.", + "97df": "row 09, column 28, with white underlined text.", + "97e0": "row 10, column 00, with plain white text.", + "9761": "row 10, column 00, with white underlined text.", + "9762": "row 10, column 00, with plain green text.", + "97e3": "row 10, column 00, with green underlined text.", + "9764": "row 10, column 00, with plain blue text.", + "97e5": "row 10, column 00, with blue underlined text.", + "97e6": "row 10, column 00, with plain cyan text.", + "9767": "row 10, column 00, with cyan underlined text.", + "9768": "row 10, column 00, with plain red text.", + "97e9": "row 10, column 00, with red underlined text.", + "97ea": "row 10, column 00, with 
plain yellow text.", + "976b": "row 10, column 00, with yellow underlined text.", + "97ec": "row 10, column 00, with plain magenta text.", + "976d": "row 10, column 00, with magenta underlined text.", + "976e": "row 10, column 00, with white italicized text.", + "97ef": "row 10, column 00, with white underlined italicized text.", + "9770": "row 10, column 00, with plain white text.", + "97f1": "row 10, column 00, with white underlined text.", + "97f2": "row 10, column 04, with plain white text.", + "9773": "row 10, column 04, with white underlined text.", + "97f4": "row 10, column 08, with plain white text.", + "9775": "row 10, column 08, with white underlined text.", + "9776": "row 10, column 12, with plain white text.", + "97f7": "row 10, column 12, with white underlined text.", + "97f8": "row 10, column 16, with plain white text.", + "9779": "row 10, column 16, with white underlined text.", + "977a": "row 10, column 20, with plain white text.", + "97fb": "row 10, column 20, with white underlined text.", + "97fc": "row 10, column 24, with plain white text.", + "97fd": "row 10, column 24, with white underlined text.", + "97fe": "row 10, column 28, with plain white text.", + "977f": "row 10, column 28, with white underlined text.", + "1040": "row 11, column 00, with plain white text.", + "10c1": "row 11, column 00, with white underlined text.", + "10c2": "row 11, column 00, with plain green text.", + "1043": "row 11, column 00, with green underlined text.", + "10c4": "row 11, column 00, with plain blue text.", + "1045": "row 11, column 00, with blue underlined text.", + "1046": "row 11, column 00, with plain cyan text.", + "10c7": "row 11, column 00, with cyan underlined text.", + "10c8": "row 11, column 00, with plain red text.", + "1049": "row 11, column 00, with red underlined text.", + "104a": "row 11, column 00, with plain yellow text.", + "10cb": "row 11, column 00, with yellow underlined text.", + "104c": "row 11, column 00, with plain magenta text.", + 
"10cd": "row 11, column 00, with magenta underlined text.", + "10ce": "row 11, column 00, with white italicized text.", + "104f": "row 11, column 00, with white underlined italicized text.", + "10d0": "row 11, column 00, with plain white text.", + "1051": "row 11, column 00, with white underlined text.", + "1052": "row 11, column 04, with plain white text.", + "10d3": "row 11, column 04, with white underlined text.", + "1054": "row 11, column 08, with plain white text.", + "10d5": "row 11, column 08, with white underlined text.", + "10d6": "row 11, column 12, with plain white text.", + "1057": "row 11, column 12, with white underlined text.", + "1058": "row 11, column 16, with plain white text.", + "10d9": "row 11, column 16, with white underlined text.", + "10da": "row 11, column 20, with plain white text.", + "105b": "row 11, column 20, with white underlined text.", + "10dc": "row 11, column 24, with plain white text.", + "105d": "row 11, column 24, with white underlined text.", + "105e": "row 11, column 28, with plain white text.", + "10df": "row 11, column 28, with white underlined text.", + "1340": "row 12, column 00, with plain white text.", + "13c1": "row 12, column 00, with white underlined text.", + "13c2": "row 12, column 00, with plain green text.", + "1343": "row 12, column 00, with green underlined text.", + "13c4": "row 12, column 00, with plain blue text.", + "1345": "row 12, column 00, with blue underlined text.", + "1346": "row 12, column 00, with plain cyan text.", + "13c7": "row 12, column 00, with cyan underlined text.", + "13c8": "row 12, column 00, with plain red text.", + "1349": "row 12, column 00, with red underlined text.", + "134a": "row 12, column 00, with plain yellow text.", + "13cb": "row 12, column 00, with yellow underlined text.", + "134c": "row 12, column 00, with plain magenta text.", + "13cd": "row 12, column 00, with magenta underlined text.", + "13ce": "row 12, column 00, with white italicized text.", + "134f": "row 12, column 
00, with white underlined italicized text.", + "13d0": "row 12, column 00, with plain white text.", + "1351": "row 12, column 00, with white underlined text.", + "1352": "row 12, column 04, with plain white text.", + "13d3": "row 12, column 04, with white underlined text.", + "1354": "row 12, column 08, with plain white text.", + "13d5": "row 12, column 08, with white underlined text.", + "13d6": "row 12, column 12, with plain white text.", + "1357": "row 12, column 12, with white underlined text.", + "1358": "row 12, column 16, with plain white text.", + "13d9": "row 12, column 16, with white underlined text.", + "13da": "row 12, column 20, with plain white text.", + "135b": "row 12, column 20, with white underlined text.", + "13dc": "row 12, column 24, with plain white text.", + "135d": "row 12, column 24, with white underlined text.", + "135e": "row 12, column 28, with plain white text.", + "13df": "row 12, column 28, with white underlined text.", + "13e0": "row 13, column 00, with plain white text.", + "1361": "row 13, column 00, with white underlined text.", + "1362": "row 13, column 00, with plain green text.", + "13e3": "row 13, column 00, with green underlined text.", + "1364": "row 13, column 00, with plain blue text.", + "13e5": "row 13, column 00, with blue underlined text.", + "13e6": "row 13, column 00, with plain cyan text.", + "1367": "row 13, column 00, with cyan underlined text.", + "1368": "row 13, column 00, with plain red text.", + "13e9": "row 13, column 00, with red underlined text.", + "13ea": "row 13, column 00, with plain yellow text.", + "136b": "row 13, column 00, with yellow underlined text.", + "13ec": "row 13, column 00, with plain magenta text.", + "136d": "row 13, column 00, with magenta underlined text.", + "136e": "row 13, column 00, with white italicized text.", + "13ef": "row 13, column 00, with white underlined italicized text.", + "1370": "row 13, column 00, with plain white text.", + "13f1": "row 13, column 00, with white
underlined text.", + "13f2": "row 13, column 04, with plain white text.", + "1373": "row 13, column 04, with white underlined text.", + "13f4": "row 13, column 08, with plain white text.", + "1375": "row 13, column 08, with white underlined text.", + "1376": "row 13, column 12, with plain white text.", + "13f7": "row 13, column 12, with white underlined text.", + "13f8": "row 13, column 16, with plain white text.", + "1379": "row 13, column 16, with white underlined text.", + "137a": "row 13, column 20, with plain white text.", + "13fb": "row 13, column 20, with white underlined text.", + "13fc": "row 13, column 24, with plain white text.", + "13fd": "row 13, column 24, with white underlined text.", + "13fe": "row 13, column 28, with plain white text.", + "137f": "row 13, column 28, with white underlined text.", + "9440": "row 14, column 00, with plain white text.", + "94c1": "row 14, column 00, with white underlined text.", + "94c2": "row 14, column 00, with plain green text.", + "9443": "row 14, column 00, with green underlined text.", + "94c4": "row 14, column 00, with plain blue text.", + "9445": "row 14, column 00, with blue underlined text.", + "9446": "row 14, column 00, with plain cyan text.", + "94c7": "row 14, column 00, with cyan underlined text.", + "94c8": "row 14, column 00, with plain red text.", + "9449": "row 14, column 00, with red underlined text.", + "944a": "row 14, column 00, with plain yellow text.", + "94cb": "row 14, column 00, with yellow underlined text.", + "944c": "row 14, column 00, with plain magenta text.", + "94cd": "row 14, column 00, with magenta underlined text.", + "94ce": "row 14, column 00, with white italicized text.", + "944f": "row 14, column 00, with white underlined italicized text.", + "94d0": "row 14, column 00, with plain white text.", + "9451": "row 14, column 00, with white underlined text.", + "9452": "row 14, column 04, with plain white text.", + "94d3": "row 14, column 04, with white underlined text.", + "9454": 
"row 14, column 08, with plain white text.", + "94d5": "row 14, column 08, with white underlined text.", + "94d6": "row 14, column 12, with plain white text.", + "9457": "row 14, column 12, with white underlined text.", + "9458": "row 14, column 16, with plain white text.", + "94d9": "row 14, column 16, with white underlined text.", + "94da": "row 14, column 20, with plain white text.", + "945b": "row 14, column 20, with white underlined text.", + "94dc": "row 14, column 24, with plain white text.", + "945d": "row 14, column 24, with white underlined text.", + "945e": "row 14, column 28, with plain white text.", + "94df": "row 14, column 28, with white underlined text.", + "94e0": "row 15, column 00, with plain white text.", + "9461": "row 15, column 00, with white underlined text.", + "9462": "row 15, column 00, with plain green text.", + "94e3": "row 15, column 00, with green underlined text.", + "9464": "row 15, column 00, with plain blue text.", + "94e5": "row 15, column 00, with blue underlined text.", + "94e6": "row 15, column 00, with plain cyan text.", + "9467": "row 15, column 00, with cyan underlined text.", + "9468": "row 15, column 00, with plain red text.", + "94e9": "row 15, column 00, with red underlined text.", + "94ea": "row 15, column 00, with plain yellow text.", + "946b": "row 15, column 00, with yellow underlined text.", + "94ec": "row 15, column 00, with plain magenta text.", + "946d": "row 15, column 00, with magenta underlined text.", + "946e": "row 15, column 00, with white italicized text.", + "94ef": "row 15, column 00, with white underlined italicized text.", + "9470": "row 15, column 00, with plain white text.", + "94f1": "row 15, column 00, with white underlined text.", + "94f2": "row 15, column 04, with plain white text.", + "9473": "row 15, column 04, with white underlined text.", + "94f4": "row 15, column 08, with plain white text.", + "9475": "row 15, column 08, with white underlined text.", + "9476": "row 15, column 12, with plain 
white text.", + "94f7": "row 15, column 12, with white underlined text.", + "94f8": "row 15, column 16, with plain white text.", + "9479": "row 15, column 16, with white underlined text.", + "947a": "row 15, column 20, with plain white text.", + "94fb": "row 15, column 20, with white underlined text.", + "94fc": "row 15, column 24, with plain white text.", + "94fd": "row 15, column 24, with white underlined text.", + "94fe": "row 15, column 28, with plain white text.", + "947f": "row 15, column 28, with white underlined text.", + "97a1": "Tab Offset 1 column", + "97a2": "Tab Offset 2 columns", + "9723": "Tab Offset 3 columns", + "94a1": "BackSpace", + "94a4": "Delete to End of Row", + "94ad": "Carriage Return", + "1020": "Background White", + "10a1": "Background Semi-Transparent White", + "10a2": "Background Green", + "1023": "Background Semi-Transparent Green", + "10a4": "Background Blue", + "1025": "Background Semi-Transparent Blue", + "1026": "Background Cyan", + "10a7": "Background Semi-Transparent Cyan", + "10a8": "Background Red", + "1029": "Background Semi-Transparent Red", + "102a": "Background Yellow", + "10ab": "Background Semi-Transparent Yellow", + "102c": "Background Magenta", + "10ad": "Background Semi-Transparent Magenta", + "10ae": "Background Black", + "102f": "Background Semi-Transparent Black", + "97ad": "Background Transparent", + "97a4": "Standard Character Set", + "9725": "Double-Size Character Set", + "9726": "First Private Character Set", + "97a7": "Second Private Character Set", + "97a8": "People`s Republic of China Character Set", + "9729": "Korean Standard Character Set", + "972a": "First Registered Character Set", + "9120": "White Plain", + "91a1": "White Underline", + "91a2": "Green Plain", + "9123": "Green Underline", + "91a4": "Blue Plain", + "9125": "Blue Underline", + "9126": "Cyan Plain", + "91a7": "Cyan Underline", + "91a8": "Red Plain", + "9129": "Red Underline", + "912a": "Yellow Plain", + "91ab": "Yellow Underline", + "912c": 
"Magenta Plain", + "91ad": "Magenta Underline", + "97ae": "Black Plain", + "972f": "Black Underline", + "91ae": "Italics", + "912f": "Italics Underline", + "94a8": "Flash ON", + "9423": "Alarm Off", + "94a2": "Alarm On" +} + +SCC_STYLES = ["bold", "italic", "underline", "plain", "underlined italicized"] + +ITALICS_COMMANDS = { + key: COMMAND_LABELS[key] for key in COMMAND_LABELS if "italic" in COMMAND_LABELS[key].lower() +} + +UNDERLINE_COMMANDS = { + key: COMMAND_LABELS[key] for key in COMMAND_LABELS if + "italic" not in COMMAND_LABELS[key].lower() and + "underline" in COMMAND_LABELS[key].lower() +} + +PLAIN_TEXT_COMMANDS = { + key: COMMAND_LABELS[key] for key in COMMAND_LABELS if "plain" in COMMAND_LABELS[key].lower() +} + +STYLE_SETTING_COMMANDS = { + **ITALICS_COMMANDS, **UNDERLINE_COMMANDS, **PLAIN_TEXT_COMMANDS +} diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py index 4b8800ed..1e8f12ee 100644 --- a/pycaption/scc/specialized_collections.py +++ b/pycaption/scc/specialized_collections.py @@ -1,14 +1,27 @@ import collections -from ..base import CaptionList, Caption, CaptionNode +from ..base import Caption, CaptionList, CaptionNode from ..geometry import ( - UnitEnum, Size, Layout, Point, Alignment, - VerticalAlignmentEnum, HorizontalAlignmentEnum + Alignment, + HorizontalAlignmentEnum, + Layout, + Point, + Size, + UnitEnum, + VerticalAlignmentEnum, ) from .constants import ( - PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS, - MICROSECONDS_PER_CODEWORD, BACKGROUND_COLOR_CODES, - MID_ROW_CODES, EXTENDED_CHARS + BACKGROUND_COLOR_CODES, + COMMANDS, + EXTENDED_CHARS, + ITALICS_COMMANDS, + MICROSECONDS_PER_CODEWORD, + MID_ROW_CODES, + PAC_BYTES_TO_POSITIONING_MAP, + PAC_TAB_OFFSET_COMMANDS, + PLAIN_TEXT_COMMANDS, + STYLE_SETTING_COMMANDS, + UNDERLINE_COMMANDS, ) PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end") @@ -31,9 +44,7 @@ def __init__(self, start=0, end=0): self.layout_info = 
None def to_real_caption(self): - return Caption( - self.start, self.end, self.nodes, self.style, self.layout_info - ) + return Caption(self.start, self.end, self.nodes, self.style, self.layout_info) class TimingCorrectingCaptionList(list): @@ -45,6 +56,7 @@ class TimingCorrectingCaptionList(list): Also, doesn't allow Nones or empty captions """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._last_batch = () @@ -98,9 +110,10 @@ def _update_last_batch(batch, *new_captions): new_caption = new_captions[0] - if batch and (batch[-1].end == 0 - or new_caption.start - batch[-1].end - < 5 * MICROSECONDS_PER_CODEWORD + 1): + if batch and ( + batch[-1].end == 0 + or new_caption.start - batch[-1].end < 5 * MICROSECONDS_PER_CODEWORD + 1 + ): for caption in batch: caption.end = new_caption.start @@ -109,6 +122,7 @@ class NotifyingDict(dict): """Dictionary-like object, that treats one key as 'active', and notifies observers if the active key changed """ + # Need an unhashable object as initial value for the active key. # That way we're sure this was never a key in the dict. 
_guard = {} @@ -124,7 +138,7 @@ def set_active(self, key): :param key: any hashable object """ if key not in self: - raise ValueError('No such key present') + raise ValueError("No such key present") # Notify observers of the change if key != self.active_key: @@ -136,7 +150,7 @@ def set_active(self, key): def get_active(self): """Returns the value corresponding to the active key""" if self.active_key is self._guard: - raise KeyError('No active key set') + raise KeyError("No active key set") return self[self.active_key] @@ -150,13 +164,14 @@ def add_change_observer(self, observer): arguments """ if not callable(observer): - raise TypeError('The observer should be callable') + raise TypeError("The observer should be callable") self.observers.append(observer) class CaptionCreator: """Creates and maintains a collection of Captions""" + def __init__(self): self._collection = TimingCorrectingCaptionList() @@ -226,28 +241,31 @@ def create_and_store(self, node_buffer, start, end=0): # handle line breaks elif instruction.is_explicit_break(): - caption.nodes.append(CaptionNode.create_break( - layout_info=_get_layout_from_tuple(instruction.position) - )) + caption.nodes.append( + CaptionNode.create_break( + layout_info=_get_layout_from_tuple(instruction.position) + ) + ) # handle open italics elif instruction.sets_italics_on(): caption.nodes.append( CaptionNode.create_style( - True, {'italics': True}, - layout_info=_get_layout_from_tuple( - instruction.position - )) + True, + {"italics": True}, + layout_info=_get_layout_from_tuple(instruction.position), + ) ) # handle clone italics elif instruction.sets_italics_off(): caption.nodes.append( CaptionNode.create_style( - False, {'italics': True}, - layout_info=_get_layout_from_tuple( - instruction.position) - )) + False, + {"italics": True}, + layout_info=_get_layout_from_tuple(instruction.position), + ) + ) # handle text elif instruction.is_text_node(): @@ -256,7 +274,7 @@ def create_and_store(self, node_buffer, start, end=0): 
CaptionNode.create_text( text=instruction.text, layout_info=layout_info, - position=instruction.position + position=instruction.position, ) ) caption.layout_info = layout_info @@ -278,6 +296,7 @@ class InstructionNodeCreator: """Creates _InstructionNode instances from characters and commands, storing them internally """ + def __init__(self, collection=None, position_tracker=None): """ :param collection: an optional collection of nodes @@ -290,8 +309,9 @@ def __init__(self, collection=None, position_tracker=None): else: self._collection = collection - self.last_style = None - + self.last_style = ( + None # can be italic on or italic off as we only support italics + ) self._position_tracer = position_tracker def is_empty(self): @@ -309,8 +329,11 @@ def add_chars(self, *chars): current_position = self._position_tracer.get_current_position() # get or create a usable node - if (self._collection and self._collection[-1].is_text_node() - and not self._position_tracer.is_repositioning_required()): + if ( + self._collection + and self._collection[-1].is_text_node() + and not self._position_tracer.is_repositioning_required() + ): node = self._collection[-1] else: # create first node @@ -319,19 +342,21 @@ def add_chars(self, *chars): # handle a simple line break if self._position_tracer.is_linebreak_required(): - # must insert a line break here - self._collection.append(_InstructionNode.create_break( - position=current_position)) + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) + self._position_tracer.acknowledge_linebreak_consumed() node = _InstructionNode.create_text(current_position) self._collection.append(node) - self._position_tracer.acknowledge_linebreak_consumed() + if self._position_tracer.is_repositioning_required(): + # it means we have a reposition command which was not followed by + # any text, so we just ignore it and break + self._position_tracer.acknowledge_position_changed() # handle completely new positioning elif 
self._position_tracer.is_repositioning_required(): self._collection.append( - _InstructionNode.create_repositioning_command( - current_position - ) + _InstructionNode.create_repositioning_command(current_position) ) node = _InstructionNode.create_text(current_position) self._collection.append(node) @@ -339,20 +364,29 @@ def add_chars(self, *chars): node.add_chars(*chars) - def interpret_command(self, command, previous_is_pac_or_tab=False): + @staticmethod + def get_style_for_command(command): + if command in ITALICS_COMMANDS: + return "italic" + elif command in UNDERLINE_COMMANDS: + return "underline" + else: + # as we only check STYLE_SETTING_COMMANDS, + # only remaining possibility is plain text + return "plaintext" + + def interpret_command(self, command, next_command=None): """Given a command determines whether to turn italics on or off, or to set the positioning This is mostly used to convert from the legacy-style commands :type command: str - :type previous_is_pac_or_tab: previous command code is for a PAC command or a PAC_TAB_OFFSET_COMMANDS + :type next_command: the command that follows next """ self._update_positioning(command) - text = COMMANDS.get(command, '') - if command == "94a1": self.handle_backspace("94a1") @@ -362,41 +396,75 @@ def interpret_command(self, command, previous_is_pac_or_tab=False): # which will be deleted when the code is applied. 
# ex: 2080 97ad 94a1 if ( - self._collection[-1].is_text_node() and - self._collection[-1].text[-1].isspace() + len(self._collection) > 0 + and self._collection[-1].is_text_node() + and self._collection[-1].text[-1].isspace() ): self._collection[-1].text = self._collection[-1].text[:-1] - if 'italic' in text: - if self._position_tracer.is_linebreak_required(): - self._collection.append(_InstructionNode.create_break( - position=self._position_tracer.get_current_position())) - self._position_tracer.acknowledge_linebreak_consumed() - if 'end' not in text: - self._collection.append( - _InstructionNode.create_italics_style( - self._position_tracer.get_current_position()) - ) - self.last_style = "italics on" + if command in STYLE_SETTING_COMMANDS: + current_position = self._position_tracer.get_current_position() + # which style is command setting + command_style = self.get_style_for_command(command) + if command_style == "italic": + if self.last_style is None or self.last_style == "italics off": + # if we don't have any style yet, or we have a closed italics tag + # it should open italic tag + # if break is required, break then add style tag + if self._position_tracer.is_linebreak_required(): + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) + self._position_tracer.acknowledge_linebreak_consumed() + self._collection.append( + _InstructionNode.create_italics_style(current_position) + ) + self.last_style = "italics on" else: - self._collection.append( - _InstructionNode.create_italics_style( - self._position_tracer.get_current_position(), - turn_on=False + # command sets a different style (underline, plain) + # so we need to close italics if we have an open italics tag + # otherwise we ignore it + # if break is required,add style tag then break + if self.last_style == "italics on": + self._collection.append( + _InstructionNode.create_italics_style( + self._position_tracer.get_current_position(), turn_on=False + ) ) - ) - 
self.last_style = "italics off" - - # mid row code that is not first code on the line - # (previous node is not a break node) - if command in MID_ROW_CODES and not previous_is_pac_or_tab: + self.last_style = "italics off" + if self._position_tracer.is_linebreak_required(): + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) + self._position_tracer.acknowledge_linebreak_consumed() + + # handle mid-row codes that follows a text node + # don't add space if the next command adds one of + # ['.', '!', '?', ','] + punctuation = ["ae", "a1", "bf", "2c"] + next_is_punctuation = next_command and next_command[:2] in punctuation + prev_text_node = self.get_previous_text_node() + prev_node_is_break = prev_text_node is not None and any( + x.is_explicit_break() + for x in self._collection[self._collection.index(prev_text_node) :] + ) + if ( + command in MID_ROW_CODES + and prev_text_node + and not prev_node_is_break + and not prev_text_node.text[-1].isspace() + and command not in PAC_TAB_OFFSET_COMMANDS + and not next_is_punctuation + ): if self.last_style == "italics off": - self.add_chars(' ') + # need to open italics tag, add a space + # to the beginning of the next text node + self.add_chars(" ") else: - for node in self._collection[::-1]: - if node.is_text_node() and node.text: - node.text += ' ' - break + # italics on + # need to close italics tag, add a space + # to the end of the previous text node + prev_text_node.text = prev_text_node.text + " " def _update_positioning(self, command): """Sets the positioning information to use for the next nodes @@ -404,16 +472,16 @@ def _update_positioning(self, command): :type command: str """ if command in PAC_TAB_OFFSET_COMMANDS: - tab_offset = PAC_TAB_OFFSET_COMMANDS[command] prev_positioning = self._position_tracer.default - positioning = (prev_positioning[0], - prev_positioning[1] + tab_offset) + tab_offset = PAC_TAB_OFFSET_COMMANDS[command] + positioning = (prev_positioning[0], 
prev_positioning[1] + tab_offset) else: first, second = command[:2], command[2:] - try: + # is PAC positioning = PAC_BYTES_TO_POSITIONING_MAP[first][second] except KeyError: + # if not PAC or OFFSET we're not changing position return self._position_tracer.update_positioning(positioning) @@ -444,7 +512,7 @@ def from_list(cls, stash_list, position_tracker): # use space to separate the stashes, but don't add final space if idx < len(stash_list) - 1: try: - instance._collection[-1].add_chars(' ') + instance._collection[-1].add_chars(" ") except AttributeError: pass @@ -462,9 +530,8 @@ def handle_backspace(self, word): return last_char = node.text[-1] delete_previous_condition = ( - (word in EXTENDED_CHARS and last_char not in EXTENDED_CHARS.values()) or - word == "94a1" - ) + word in EXTENDED_CHARS and last_char not in EXTENDED_CHARS.values() + ) or word == "94a1" # in case extended char, perform backspace # only if the previous character in not also extended if delete_previous_condition: @@ -496,10 +563,10 @@ def _get_layout_from_tuple(position_tuple): horizontal = Size(80 * column / 32.0 + 10, UnitEnum.PERCENT) # Vertical safe area between 5% and 95% vertical = Size(90 * (row - 1) / 15.0 + 5, UnitEnum.PERCENT) - return Layout(origin=Point(horizontal, vertical), - alignment=Alignment(HorizontalAlignmentEnum.LEFT, - VerticalAlignmentEnum.TOP) - ) + return Layout( + origin=Point(horizontal, vertical), + alignment=Alignment(HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP), + ) class _InstructionNode: @@ -509,6 +576,7 @@ class _InstructionNode: These nodes will be aggregated into a RepresentableNode, which will then be easily converted to a CaptionNode. 
""" + TEXT = 0 BREAK = 1 ITALICS_ON = 2 @@ -533,9 +601,9 @@ def add_chars(self, *args): :return: """ if self.text is None: - self.text = '' + self.text = "" - self.text += ''.join(args) + self.text += "".join(args) def is_text_node(self): """ @@ -585,7 +653,7 @@ def requires_repositioning(self): def get_text(self): """A little legacy code.""" - return ' '.join(self.text.split()) + return " ".join(self.text.split()) @classmethod def create_break(cls, position): @@ -610,7 +678,7 @@ def create_text(cls, position, *chars): :rtype: _InstructionNode """ - return cls(''.join(chars), position=position) + return cls("".join(chars), position=position) @classmethod def create_italics_style(cls, position, turn_on=True): @@ -625,8 +693,7 @@ def create_italics_style(cls, position, turn_on=True): :rtype: _InstructionNode """ return cls( - position=position, - type_=cls.ITALICS_ON if turn_on else cls.ITALICS_OFF + position=position, type_=cls.ITALICS_ON if turn_on else cls.ITALICS_OFF ) @classmethod @@ -638,19 +705,19 @@ def create_repositioning_command(cls, position=None): """ return cls(type_=cls.CHANGE_POSITION, position=position) - def __repr__(self): # pragma: no cover + def __repr__(self): # pragma: no cover if self._type == self.BREAK: - extra = 'BR' + extra = "BR" elif self._type == self.TEXT: extra = f'"{self.text}"' elif self._type in (self.ITALICS_ON, self.ITALICS_OFF): - extra = 'italics {}'.format( - 'on' if self._type == self.ITALICS_ON else 'off' + extra = "italics {}".format( + "on" if self._type == self.ITALICS_ON else "off" ) else: - extra = 'change position' + extra = "change position" - return f'' + return f"" def _format_italics(collection): @@ -689,9 +756,27 @@ def _format_italics(collection): # removes pairs of italics nodes that don't do anything noticeable new_collection = _remove_noop_italics(new_collection) + # remove spaces to the end of the lines + new_collection = _remove_spaces_at_end_of_the_line(new_collection) + return new_collection +def 
_remove_spaces_at_end_of_the_line(collection): + for idx, node in enumerate(collection): + if ( + idx > 0 + and node._type == _InstructionNode.BREAK + and collection[idx - 1].is_text_node() + and collection[idx - 1].text + ): + collection[idx - 1].text = collection[idx - 1].text.rstrip() + # handle last node + if collection[-1].is_text_node(): + collection[-1].text = collection[-1].text.rstrip() + return collection + + def _remove_noop_on_off_italics(collection): """Return an equivalent list to `collection`. It removes the italics node pairs that don't surround text nodes, if those nodes are in the order: @@ -798,8 +883,9 @@ def _skip_empty_text_nodes(collection): :type collection: list[_InstructionNode] :rtype: list[_InstructionNode] """ - return [node for node in collection - if not (node.is_text_node() and node.is_empty())] + return [ + node for node in collection if not (node.is_text_node() and node.is_empty()) + ] def _skip_redundant_italics_nodes(collection): @@ -817,7 +903,8 @@ def _skip_redundant_italics_nodes(collection): if node.is_italics_node(): if state is None: state = node.sets_italics_on() - new_collection.append(node) + if node.sets_italics_on(): + new_collection.append(node) continue # skip the nodes that are like the previous if node.sets_italics_on() is state: @@ -831,19 +918,19 @@ def _skip_redundant_italics_nodes(collection): def _close_italics_before_repositioning(collection): """Make sure that for every opened italic node, there's a corresponding - closing node. + closing node. 
- Will insert a closing italic node, before each repositioning node + Will insert a closing italic node, before each repositioning node - :type collection: list[_InstructionNode] - :rtype: list[_InstructionNode] + :type collection: list[_InstructionNode] + :rtype: list[_InstructionNode] """ new_collection = [] italics_on = False last_italics_on_node = None - for idx, node in enumerate(collection): + for node in collection: if node.is_italics_node() and node.sets_italics_on(): italics_on = True last_italics_on_node = node @@ -855,17 +942,16 @@ def _close_italics_before_repositioning(collection): _InstructionNode.create_italics_style( # The position info of this new node should be the same position=last_italics_on_node.position, - turn_on=False + turn_on=False, ) ) new_collection.append(node) # Append an italics opening node after the positioning change new_collection.append( - _InstructionNode.create_italics_style( - position=node.position - ) + _InstructionNode.create_italics_style(position=node.position) ) continue + new_collection.append(node) return new_collection @@ -892,8 +978,7 @@ def _ensure_final_italics_node_closes(collection): if italics_on: new_collection.append( _InstructionNode.create_italics_style( - position=last_italics_on_node.position, - turn_on=False + position=last_italics_on_node.position, turn_on=False ) ) return new_collection diff --git a/pycaption/scc/state_machines.py b/pycaption/scc/state_machines.py index 04fc632b..af5cd537 100644 --- a/pycaption/scc/state_machines.py +++ b/pycaption/scc/state_machines.py @@ -5,6 +5,7 @@ class _PositioningTracker: """Helps determine the positioning of a node, having kept track of positioning-related commands. 
""" + def __init__(self, positioning=None): """ :param positioning: positioning information (row, column) @@ -39,10 +40,9 @@ def update_positioning(self, positioning): col = self._last_column new_row, new_col = positioning is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3 - # One line below will be treated as line break, not repositioning if new_row == row + 1: - self._positions.append((new_row, col)) + self._positions.append((new_row, new_col)) self._break_required = True self._last_column = new_col # Tab offsets after line breaks will be ignored to avoid repositioning @@ -64,9 +64,7 @@ def get_current_position(self): :raise: CaptionReadSyntaxError """ if not any(self._positions): - raise CaptionReadSyntaxError( - 'No Preamble Address Code [PAC] was provided' - ) + raise CaptionReadSyntaxError("No Preamble Address Code [PAC] was provided") else: return self._positions[0] @@ -97,6 +95,7 @@ class DefaultProvidingPositionTracker(_PositioningTracker): """A _PositioningTracker that provides if needed a default value (14, 0), or uses the last positioning value set anywhere in the document """ + default = (14, 0) def __init__(self, positioning=None, default=None): diff --git a/pycaption/scc/translator.py b/pycaption/scc/translator.py index 88fc36e0..aba7f8a2 100644 --- a/pycaption/scc/translator.py +++ b/pycaption/scc/translator.py @@ -1,556 +1,11 @@ -from pycaption.scc.constants import CHARACTERS, SPECIAL_CHARS, EXTENDED_CHARS - -ALL_CHARACTERS = {**CHARACTERS, **SPECIAL_CHARS, **EXTENDED_CHARS} -COMMAND_LABELS = { - "9420": "Resume Caption Loading", - "9429": "Resume Direct Captioning", - "9425": "Roll-Up Captions--2 Rows", - "9426": "Roll-Up Captions--3 Rows", - "94a7": "Roll-Up Captions--4 Rows", - "942a": "Text Restart", - "94ab": "Resume Text Display", - "942c": "Erase Displayed Memory", - "94ae": "Erase Non-displayed Memory", - "942f": "End Of Caption", - "9140": "row 01, column 00, with plain white text.", - "91c1": "row 01, column 00, with white 
underlined text.", - "91c2": "row 01, column 00, with plain green text.", - "9143": "row 01, column 00, with green underlined text.", - "91c4": "row 01, column 00, with plain blue text.", - "9145": "row 01, column 00, with blue underlined text.", - "9146": "row 01, column 00, with plain cyan text.", - "91c7": "row 01, column 00, with cyan underlined text.", - "91c8": "row 01, column 00, with plain red text.", - "9149": "row 01, column 00, with red underlined text.", - "914a": "row 01, column 00, with plain yellow text.", - "91cb": "row 01, column 00, with yellow underlined text.", - "914c": "row 01, column 00, with plain magenta text.", - "91cd": "row 01, column 00, with magenta underlined text.", - "91ce": "row 01, column 00, with white italicized text.", - "914f": "row 01, column 00, with white underlined italicized text.", - "91d0": "row 01, column 00, with plain white text.", - "9151": "row 01, column 00, with white underlined text.", - "9152": "row 01, column 04, with plain white text.", - "91d3": "row 01, column 04, with white underlined text.", - "9154": "row 01, column 08, with plain white text.", - "91d5": "row 01, column 08, with white underlined text.", - "91d6": "row 01, column 12, with plain white text.", - "9157": "row 01, column 12, with white underlined text.", - "9158": "row 01, column 16, with plain white text.", - "91d9": "row 01, column 16, with white underlined text.", - "91da": "row 01, column 20, with plain white text.", - "915b": "row 01, column 20, with white underlined text.", - "91dc": "row 01, column 24, with plain white text.", - "915d": "row 01, column 24, with white underlined text.", - "915e": "row 01, column 28, with plain white text.", - "91df": "row 01, column 28, with white underlined text.", - "91e0": "row 02, column 00, with plain white text.", - "9161": "row 02, column 00, with white underlined text.", - "9162": "row 02, column 00, with plain green text.", - "91e3": "row 02, column 00, with green underlined text.", - "9164": 
"row 02, column 00, with plain blue text.", - "91e5": "row 02, column 00, with blue underlined text.", - "91e6": "row 02, column 00, with plain cyan text.", - "9167": "row 02, column 00, with cyan underlined text.", - "9168": "row 02, column 00, with plain red text.", - "91e9": "row 02, column 00, with red underlined text.", - "91ea": "row 02, column 00, with plain yellow text.", - "916b": "row 02, column 00, with yellow underlined text.", - "91ec": "row 02, column 00, with plain magenta text.", - "916d": "row 02, column 00, with magenta underlined text.", - "916e": "row 02, column 00, with white italicized text.", - "91ef": "row 02, column 00, with white underlined italicized text.", - "9170": "row 02, column 00, with plain white text.", - "91f1": "row 02, column 00, with white underlined text.", - "91f2": "row 02, column 04, with plain white text.", - "9173": "row 02, column 04, with white underlined text.", - "91f4": "row 02, column 08, with plain white text.", - "9175": "row 02, column 08, with white underlined text.", - "9176": "row 02, column 12, with plain white text.", - "91f7": "row 02, column 12, with white underlined text.", - "91f8": "row 02, column 16, with plain white text.", - "9179": "row 02, column 16, with white underlined text.", - "917a": "row 02, column 20, with plain white text.", - "91fb": "row 02, column 20, with white underlined text.", - "91fc": "row 02, column 24, with plain white text.", - "91fd": "row 02, column 24, with white underlined text.", - "91fe": "row 02, column 28, with plain white text.", - "917f": "row 02, column 28, with white underlined text.", - "9240": "row 03, column 00, with plain white text.", - "92c1": "row 03, column 00, with white underlined text.", - "92c2": "row 03, column 00, with plain green text.", - "9243": "row 03, column 00, with green underlined text.", - "92c4": "row 03, column 00, with plain blue text.", - "9245": "row 03, column 00, with blue underlined text.", - "9246": "row 03, column 00, with plain 
cyan text.", - "92c7": "row 03, column 00, with cyan underlined text.", - "92c8": "row 03, column 00, with plain red text.", - "9249": "row 03, column 00, with red underlined text.", - "924a": "row 03, column 00, with plain yellow text.", - "92cb": "row 03, column 00, with yellow underlined text.", - "924c": "row 03, column 00, with plain magenta text.", - "92cd": "row 03, column 00, with magenta underlined text.", - "92ce": "row 03, column 00, with white italicized text.", - "924f": "row 03, column 00, with white underlined italicized text.", - "92d0": "row 03, column 00, with plain white text.", - "9251": "row 03, column 00, with white underlined text.", - "9252": "row 03, column 04, with plain white text.", - "92d3": "row 03, column 04, with white underlined text.", - "9254": "row 03, column 08, with plain white text.", - "92d5": "row 03, column 08, with white underlined text.", - "92d6": "row 03, column 12, with plain white text.", - "9257": "row 03, column 12, with white underlined text.", - "9258": "row 03, column 16, with plain white text.", - "92d9": "row 03, column 16, with white underlined text.", - "92da": "row 03, column 20, with plain white text.", - "925b": "row 03, column 20, with white underlined text.", - "92dc": "row 03, column 24, with plain white text.", - "925d": "row 03, column 24, with white underlined text.", - "925e": "row 03, column 28, with plain white text.", - "92df": "row 03, column 28, with white underlined text.", - "92e0": "row 04, column 00, with plain white text.", - "9261": "row 04, column 00, with white underlined text.", - "9262": "row 04, column 00, with plain green text.", - "92e3": "row 04, column 00, with green underlined text.", - "9264": "row 04, column 00, with plain blue text.", - "92e5": "row 04, column 00, with blue underlined text.", - "92e6": "row 04, column 00, with plain cyan text.", - "9267": "row 04, column 00, with cyan underlined text.", - "9268": "row 04, column 00, with plain red text.", - "92e9": "row 04, 
column 00, with red underlined text.", - "92ea": "row 04, column 00, with plain yellow text.", - "926b": "row 04, column 00, with yellow underlined text.", - "92ec": "row 04, column 00, with plain magenta text.", - "926d": "row 04, column 00, with magenta underlined text.", - "926e": "row 04, column 00, with white italicized text.", - "92ef": "row 04, column 00, with white underlined italicized text.", - "9270": "row 04, column 00, with plain white text.", - "92f1": "row 04, column 00, with white underlined text.", - "92f2": "row 04, column 04, with plain white text.", - "9273": "row 04, column 04, with white underlined text.", - "92f4": "row 04, column 08, with plain white text.", - "9275": "row 04, column 08, with white underlined text.", - "9276": "row 04, column 12, with plain white text.", - "92f7": "row 04, column 12, with white underlined text.", - "92f8": "row 04, column 16, with plain white text.", - "9279": "row 04, column 16, with white underlined text.", - "927a": "row 04, column 20, with plain white text.", - "92fb": "row 04, column 20, with white underlined text.", - "92fc": "row 04, column 24, with plain white text.", - "92fd": "row 04, column 24, with white underlined text.", - "92fe": "row 04, column 28, with plain white text.", - "927f": "row 04, column 28, with white underlined text.", - "1540": "row 05, column 00, with plain white text.", - "15c1": "row 05, column 00, with white underlined text.", - "15c2": "row 05, column 00, with plain green text.", - "1543": "row 05, column 00, with green underlined text.", - "15c4": "row 05, column 00, with plain blue text.", - "1545": "row 05, column 00, with blue underlined text.", - "1546": "row 05, column 00, with plain cyan text.", - "15c7": "row 05, column 00, with cyan underlined text.", - "15c8": "row 05, column 00, with plain red text.", - "1549": "row 05, column 00, with red underlined text.", - "154a": "row 05, column 00, with plain yellow text.", - "15cb": "row 05, column 00, with yellow 
underlined text.", - "154c": "row 05, column 00, with plain magenta text.", - "15cd": "row 05, column 00, with magenta underlined text.", - "15ce": "row 05, column 00, with white italicized text.", - "154f": "row 05, column 00, with white underlined italicized text.", - "15d0": "row 05, column 00, with plain white text.", - "1551": "row 05, column 00, with white underlined text.", - "1552": "row 05, column 04, with plain white text.", - "15d3": "row 05, column 04, with white underlined text.", - "1554": "row 05, column 08, with plain white text.", - "15d5": "row 05, column 08, with white underlined text.", - "15d6": "row 05, column 12, with plain white text.", - "1557": "row 05, column 12, with white underlined text.", - "1558": "row 05, column 16, with plain white text.", - "15d9": "row 05, column 16, with white underlined text.", - "15da": "row 05, column 20, with plain white text.", - "155b": "row 05, column 20, with white underlined text.", - "15dc": "row 05, column 24, with plain white text.", - "155d": "row 05, column 24, with white underlined text.", - "155e": "row 05, column 28, with plain white text.", - "15df": "row 05, column 28, with white underlined text.", - "15e0": "row 06, column 00, with plain white text.", - "1561": "row 06, column 00, with white underlined text.", - "15462": "row 06, column 00, with plain green text.", - "15e3": "row 06, column 00, with green underlined text.", - "1564": "row 06, column 00, with plain blue text.", - "15e5": "row 06, column 00, with blue underlined text.", - "15e6": "row 06, column 00, with plain cyan text.", - "1567": "row 06, column 00, with cyan underlined text.", - "1568": "row 06, column 00, with plain red text.", - "15e9": "row 06, column 00, with red underlined text.", - "15ea": "row 06, column 00, with plain yellow text.", - "156b": "row 06, column 00, with yellow underlined text.", - "15ec": "row 06, column 00, with plain magenta text.", - "156d": "row 06, column 00, with magenta underlined text.", - 
"156e": "row 06, column 00, with white italicized text.", - "15ef": "row 06, column 00, with white underlined italicized text.", - "1570": "row 06, column 00, with plain white text.", - "15f1": "row 06, column 00, with white underlined text.", - "15f2": "row 06, column 04, with plain white text.", - "1573": "row 06, column 04, with white underlined text.", - "15f4": "row 06, column 08, with plain white text.", - "1575": "row 06, column 08, with white underlined text.", - "1576": "row 06, column 12, with plain white text.", - "15f7": "row 06, column 12, with white underlined text.", - "15f8": "row 06, column 16, with plain white text.", - "1579": "row 06, column 16, with white underlined text.", - "157a": "row 06, column 20, with plain white text.", - "15fb": "row 06, column 20, with white underlined text.", - "15fc": "row 06, column 24, with plain white text.", - "15fd": "row 06, column 24, with white underlined text.", - "15fe": "row 06, column 28, with plain white text.", - "157f": "row 06, column 28, with white underlined text.", - "1640": "row 07, column 00, with plain white text.", - "16c1": "row 07, column 00, with white underlined text.", - "16c2": "row 07, column 00, with plain green text.", - "1643": "row 07, column 00, with green underlined text.", - "16c4": "row 07, column 00, with plain blue text.", - "1645": "row 07, column 00, with blue underlined text.", - "1646": "row 07, column 00, with plain cyan text.", - "16c7": "row 07, column 00, with cyan underlined text.", - "16c8": "row 07, column 00, with plain red text.", - "1649": "row 07, column 00, with red underlined text.", - "164a": "row 07, column 00, with plain yellow text.", - "16cb": "row 07, column 00, with yellow underlined text.", - "164c": "row 07, column 00, with plain magenta text.", - "16cd": "row 07, column 00, with magenta underlined text.", - "16ce": "row 07, column 00, with white italicized text.", - "164f": "row 07, column 00, with white underlined italicized text.", - "16d0": "row 
07, column 00, with plain white text.", - "1651": "row 07, column 00, with white underlined text.", - "1652": "row 07, column 04, with plain white text.", - "16d3": "row 07, column 04, with white underlined text.", - "1654": "row 07, column 08, with plain white text.", - "16d5": "row 07, column 08, with white underlined text.", - "16d6": "row 07, column 12, with plain white text.", - "1657": "row 07, column 12, with white underlined text.", - "1658": "row 07, column 16, with plain white text.", - "16d9": "row 07, column 16, with white underlined text.", - "16da": "row 07, column 20, with plain white text.", - "165b": "row 07, column 20, with white underlined text.", - "16dc": "row 07, column 24, with plain white text.", - "165d": "row 07, column 24, with white underlined text.", - "165e": "row 07, column 28, with plain white text.", - "16df": "row 07, column 28, with white underlined text.", - "16e0": "row 08, column 00, with plain white text.", - "1661": "row 08, column 00, with white underlined text.", - "16462": "row 08, column 00, with plain green text.", - "16e3": "row 08, column 00, with green underlined text.", - "1664": "row 08, column 00, with plain blue text.", - "16e5": "row 08, column 00, with blue underlined text.", - "16e6": "row 08, column 00, with plain cyan text.", - "1667": "row 08, column 00, with cyan underlined text.", - "1668": "row 08, column 00, with plain red text.", - "16e9": "row 08, column 00, with red underlined text.", - "16ea": "row 08, column 00, with plain yellow text.", - "166b": "row 08, column 00, with yellow underlined text.", - "16ec": "row 08, column 00, with plain magenta text.", - "166d": "row 08, column 00, with magenta underlined text.", - "166e": "row 08, column 00, with white italicized text.", - "16ef": "row 08, column 00, with white underlined italicized text.", - "1670": "row 08, column 00, with plain white text.", - "16f1": "row 08, column 00, with white underlined text.", - "16f2": "row 08, column 04, with plain 
white text.", - "1673": "row 08, column 04, with white underlined text.", - "16f4": "row 08, column 08, with plain white text.", - "1675": "row 08, column 08, with white underlined text.", - "1676": "row 08, column 12, with plain white text.", - "16f7": "row 08, column 12, with white underlined text.", - "16f8": "row 08, column 16, with plain white text.", - "1679": "row 08, column 16, with white underlined text.", - "167a": "row 08, column 20, with plain white text.", - "16fb": "row 08, column 20, with white underlined text.", - "16fc": "row 08, column 24, with plain white text.", - "16fd": "row 08, column 24, with white underlined text.", - "16fe": "row 08, column 28, with plain white text.", - "167f": "row 08, column 28, with white underlined text.", - "9740": "row 09, column 00, with plain white text.", - "97c1": "row 09, column 00, with white underlined text.", - "97c2": "row 09, column 00, with plain green text.", - "9743": "row 09, column 00, with green underlined text.", - "97c4": "row 09, column 00, with plain blue text.", - "9745": "row 09, column 00, with blue underlined text.", - "9746": "row 09, column 00, with plain cyan text.", - "97c7": "row 09, column 00, with cyan underlined text.", - "97c8": "row 09, column 00, with plain red text.", - "9749": "row 09, column 00, with red underlined text.", - "974a": "row 09, column 00, with plain yellow text.", - "97cb": "row 09, column 00, with yellow underlined text.", - "974c": "row 09, column 00, with plain magenta text.", - "97cd": "row 09, column 00, with magenta underlined text.", - "97ce": "row 09, column 00, with white italicized text.", - "974f": "row 09, column 00, with white underlined italicized text.", - "97d0": "row 09, column 00, with plain white text.", - "9751": "row 09, column 00, with white underlined text.", - "9752": "row 09, column 04, with plain white text.", - "97d3": "row 09, column 04, with white underlined text.", - "9754": "row 09, column 08, with plain white text.", - "97d5": "row 
09, column 08, with white underlined text.", - "97d6": "row 09, column 12, with plain white text.", - "9757": "row 09, column 12, with white underlined text.", - "9758": "row 09, column 16, with plain white text.", - "97d9": "row 09, column 16, with white underlined text.", - "97da": "row 09, column 20, with plain white text.", - "975b": "row 09, column 20, with white underlined text.", - "97dc": "row 09, column 24, with plain white text.", - "975d": "row 09, column 24, with white underlined text.", - "975e": "row 09, column 28, with plain white text.", - "97df": "row 09, column 28, with white underlined text.", - "97e0": "row 10, column 00, with plain white text.", - "9761": "row 10, column 00, with white underlined text.", - "9762": "row 10, column 00, with plain green text.", - "97e3": "row 10, column 00, with green underlined text.", - "9764": "row 10, column 00, with plain blue text.", - "97e5": "row 10, column 00, with blue underlined text.", - "97e6": "row 10, column 00, with plain cyan text.", - "9767": "row 10, column 00, with cyan underlined text.", - "9768": "row 10, column 00, with plain red text.", - "97e9": "row 10, column 00, with red underlined text.", - "97ea": "row 10, column 00, with plain yellow text.", - "976b": "row 10, column 00, with yellow underlined text.", - "97ec": "row 10, column 00, with plain magenta text.", - "976d": "row 10, column 00, with magenta underlined text.", - "976e": "row 10, column 00, with white italicized text.", - "97ef": "row 10, column 00, with white underlined italicized text.", - "9770": "row 10, column 00, with plain white text.", - "97f1": "row 10, column 00, with white underlined text.", - "97f2": "row 10, column 04, with plain white text.", - "9773": "row 10, column 04, with white underlined text.", - "97f4": "row 10, column 08, with plain white text.", - "9775": "row 10, column 08, with white underlined text.", - "9776": "row 10, column 12, with plain white text.", - "97f7": "row 10, column 12, with white 
underlined text.", - "97f8": "row 10, column 16, with plain white text.", - "9779": "row 10, column 16, with white underlined text.", - "977a": "row 10, column 20, with plain white text.", - "97fb": "row 10, column 20, with white underlined text.", - "97fc": "row 10, column 24, with plain white text.", - "97fd": "row 10, column 24, with white underlined text.", - "97fe": "row 10, column 28, with plain white text.", - "977f": "row 10, column 28, with white underlined text.", - "1040": "row 11, column 00, with plain white text.", - "10c1": "row 11, column 00, with white underlined text.", - "10c2": "row 11, column 00, with plain green text.", - "1043": "row 11, column 00, with green underlined text.", - "10c4": "row 11, column 00, with plain blue text.", - "1045": "row 11, column 00, with blue underlined text.", - "1046": "row 11, column 00, with plain cyan text.", - "10c7": "row 11, column 00, with cyan underlined text.", - "10c8": "row 11, column 00, with plain red text.", - "1049": "row 11, column 00, with red underlined text.", - "104a": "row 11, column 00, with plain yellow text.", - "10cb": "row 11, column 00, with yellow underlined text.", - "104c": "row 11, column 00, with plain magenta text.", - "10cd": "row 11, column 00, with magenta underlined text.", - "10ce": "row 11, column 00, with white italicized text.", - "104f": "row 11, column 00, with white underlined italicized text.", - "10d0": "row 11, column 00, with plain white text.", - "1051": "row 11, column 00, with white underlined text.", - "1052": "row 11, column 04, with plain white text.", - "10d3": "row 11, column 04, with white underlined text.", - "1054": "row 11, column 08, with plain white text.", - "10d5": "row 11, column 08, with white underlined text.", - "10d6": "row 11, column 12, with plain white text.", - "1057": "row 11, column 12, with white underlined text.", - "1058": "row 11, column 16, with plain white text.", - "10d9": "row 11, column 16, with white underlined text.", - "10da": 
"row 11, column 20, with plain white text.", - "105b": "row 11, column 20, with white underlined text.", - "10dc": "row 11, column 24, with plain white text.", - "105d": "row 11, column 24, with white underlined text.", - "105e": "row 11, column 28, with plain white text.", - "10df": "row 11, column 28, with white underlined text.", - "1340": "row 12, column 00, with plain white text.", - "13c1": "row 12, column 00, with white underlined text.", - "13c2": "row 12, column 00, with plain green text.", - "1343": "row 12, column 00, with green underlined text.", - "13c4": "row 12, column 00, with plain blue text.", - "1345": "row 12, column 00, with blue underlined text.", - "1346": "row 12, column 00, with plain cyan text.", - "13c7": "row 12, column 00, with cyan underlined text.", - "13c8": "row 12, column 00, with plain red text.", - "1349": "row 12, column 00, with red underlined text.", - "134a": "row 12, column 00, with plain yellow text.", - "13cb": "row 12, column 00, with yellow underlined text.", - "134c": "row 12, column 00, with plain magenta text.", - "13cd": "row 12, column 00, with magenta underlined text.", - "13ce": "row 12, column 00, with white italicized text.", - "134f": "row 12, column 00, with white underlined italicized text.", - "13d0": "row 12, column 00, with plain white text.", - "1351": "row 12, column 00, with white underlined text.", - "1352": "row 12, column 04, with plain white text.", - "13d3": "row 12, column 04, with white underlined text.", - "1354": "row 12, column 08, with plain white text.", - "13d5": "row 12, column 08, with white underlined text.", - "13d6": "row 12, column 12, with plain white text.", - "1357": "row 12, column 12, with white underlined text.", - "1358": "row 12, column 16, with plain white text.", - "13d9": "row 12, column 16, with white underlined text.", - "13da": "row 12, column 20, with plain white text.", - "135b": "row 12, column 20, with white underlined text.", - "13dc": "row 12, column 24, with plain 
white text.", - "135d": "row 12, column 24, with white underlined text.", - "135e": "row 12, column 28, with plain white text.", - "13df": "row 12, column 28, with white underlined text.", - "13e0": "row 13, column 00, with plain white text.", - "1361": "row 13, column 00, with white underlined text.", - "13462": "row 13, column 00, with plain green text.", - "13e3": "row 13, column 00, with green underlined text.", - "1364": "row 13, column 00, with plain blue text.", - "13e5": "row 13, column 00, with blue underlined text.", - "13e6": "row 13, column 00, with plain cyan text.", - "1367": "row 13, column 00, with cyan underlined text.", - "1368": "row 13, column 00, with plain red text.", - "13e9": "row 13, column 00, with red underlined text.", - "13ea": "row 13, column 00, with plain yellow text.", - "136b": "row 13, column 00, with yellow underlined text.", - "13ec": "row 13, column 00, with plain magenta text.", - "136d": "row 13, column 00, with magenta underlined text.", - "136e": "row 13, column 00, with white italicized text.", - "13ef": "row 13, column 00, with white underlined italicized text.", - "1370": "row 13, column 00, with plain white text.", - "13f1": "row 13, column 00, with white underlined text.", - "13f2": "row 13, column 04, with plain white text.", - "1373": "row 13, column 04, with white underlined text.", - "13f4": "row 13, column 08, with plain white text.", - "1375": "row 13, column 08, with white underlined text.", - "1376": "row 13, column 12, with plain white text.", - "13f7": "row 13, column 12, with white underlined text.", - "13f8": "row 13, column 16, with plain white text.", - "1379": "row 13, column 16, with white underlined text.", - "137a": "row 13, column 20, with plain white text.", - "13fb": "row 13, column 20, with white underlined text.", - "13fc": "row 13, column 24, with plain white text.", - "13fd": "row 13, column 24, with white underlined text.", - "13fe": "row 13, column 28, with plain white text.", - "137f": "row 
13, column 28, with white underlined text.", - "9440": "row 14, column 00, with plain white text.", - "94c1": "row 14, column 00, with white underlined text.", - "94c2": "row 14, column 00, with plain green text.", - "9443": "row 14, column 00, with green underlined text.", - "94c4": "row 14, column 00, with plain blue text.", - "9445": "row 14, column 00, with blue underlined text.", - "9446": "row 14, column 00, with plain cyan text.", - "94c7": "row 14, column 00, with cyan underlined text.", - "94c8": "row 14, column 00, with plain red text.", - "9449": "row 14, column 00, with red underlined text.", - "944a": "row 14, column 00, with plain yellow text.", - "94cb": "row 14, column 00, with yellow underlined text.", - "944c": "row 14, column 00, with plain magenta text.", - "94cd": "row 14, column 00, with magenta underlined text.", - "94ce": "row 14, column 00, with white italicized text.", - "944f": "row 14, column 00, with white underlined italicized text.", - "94d0": "row 14, column 00, with plain white text.", - "9451": "row 14, column 00, with white underlined text.", - "9452": "row 14, column 04, with plain white text.", - "94d3": "row 14, column 04, with white underlined text.", - "9454": "row 14, column 08, with plain white text.", - "94d5": "row 14, column 08, with white underlined text.", - "94d6": "row 14, column 12, with plain white text.", - "9457": "row 14, column 12, with white underlined text.", - "9458": "row 14, column 16, with plain white text.", - "94d9": "row 14, column 16, with white underlined text.", - "94da": "row 14, column 20, with plain white text.", - "945b": "row 14, column 20, with white underlined text.", - "94dc": "row 14, column 24, with plain white text.", - "945d": "row 14, column 24, with white underlined text.", - "945e": "row 14, column 28, with plain white text.", - "94df": "row 14, column 28, with white underlined text.", - "94e0": "row 15, column 00, with plain white text.", - "9461": "row 15, column 00, with white 
underlined text.", - "9462": "row 15, column 00, with plain green text.", - "94e3": "row 15, column 00, with green underlined text.", - "9464": "row 15, column 00, with plain blue text.", - "94e5": "row 15, column 00, with blue underlined text.", - "94e6": "row 15, column 00, with plain cyan text.", - "9467": "row 15, column 00, with cyan underlined text.", - "9468": "row 15, column 00, with plain red text.", - "94e9": "row 15, column 00, with red underlined text.", - "94ea": "row 15, column 00, with plain yellow text.", - "946b": "row 15, column 00, with yellow underlined text.", - "94ec": "row 15, column 00, with plain magenta text.", - "946d": "row 15, column 00, with magenta underlined text.", - "946e": "row 15, column 00, with white italicized text.", - "94ef": "row 15, column 00, with white underlined italicized text.", - "9470": "row 15, column 00, with plain white text.", - "94f1": "row 15, column 00, with white underlined text.", - "94f2": "row 15, column 04, with plain white text.", - "9473": "row 15, column 04, with white underlined text.", - "94f4": "row 15, column 08, with plain white text.", - "9475": "row 15, column 08, with white underlined text.", - "9476": "row 15, column 12, with plain white text.", - "94f7": "row 15, column 12, with white underlined text.", - "94f8": "row 15, column 16, with plain white text.", - "9479": "row 15, column 16, with white underlined text.", - "947a": "row 15, column 20, with plain white text.", - "94fb": "row 15, column 20, with white underlined text.", - "94fc": "row 15, column 24, with plain white text.", - "94fd": "row 15, column 24, with white underlined text.", - "94fe": "row 15, column 28, with plain white text.", - "947f": "row 15, column 28, with white underlined text.", - "97a1": "Tab Offset 1 column", - "97a2": "Tab Offset 2 columns", - "9723": "Tab Offset 3 columns", - "94a1": "BackSpace", - "94a4": "Delete to End of Row", - "94ad": "Carriage Return", - "1020": "Background White", - "10a1": "Background 
Semi-Transparent White", - "10a2": "Background Green", - "1023": "Background Semi-Transparent Green", - "10a4": "Background Blue", - "1025": "Background Semi-Transparent Blue", - "1026": "Background Cyan", - "10a7": "Background Semi-Transparent Cyan", - "10a8": "Background Red", - "1029": "Background Semi-Transparent Red", - "102a": "Background Yellow", - "10ab": "Background Semi-Transparent Yellow", - "102c": "Background Magenta", - "10ad": "Background Semi-Transparent Magenta", - "10ae": "Background Black", - "102f": "Background Semi-Transparent Black", - "97ad": "Background Transparent", - "97a4": "Standard Character Set", - "9725": "Double-Size Character Set", - "9726": "First Private Character Set", - "97a7": "Second Private Character Set", - "97a8": "People`s Republic of China Character Set", - "9729": "Korean Standard Character Set", - "972a": "First Registered Character Set", - "9120": "White", - "91a1": "White Underline", - "91a2": "Green", - "9123": "Green Underline", - "91a4": "Blue", - "9125": "Blue Underline", - "9126": "Cyan", - "91a7": "Cyan Underline", - "91a8": "Red", - "9129": "Red Underline", - "912a": "Yellow", - "91ab": "Yellow Underline", - "912c": "Magenta", - "91ad": "Magenta Underline", - "97ae": "Black", - "972f": "Black Underline", - "91ae": "Italics", - "912f": "Italics Underline", - "94a8": "Flash ON", - "9423": "Alarm Off", - "94a2": "Alarm On" -} +from pycaption.scc.constants import ALL_CHARACTERS, COMMAND_LABELS def translate_scc(scc_content, brackets='[]'): """ Replaces hexadecimal words with their meaning - In order to make SCC files more human readable and easier to debug, + In order to make SCC files more human-readable and easier to debug, this function is used to replace command codes with their labels and character bytes with their actual characters diff --git a/pycaption/transcript.py b/pycaption/transcript.py index a65d9b3b..46df9f95 100644 --- a/pycaption/transcript.py +++ b/pycaption/transcript.py @@ -4,6 +4,7 @@ import 
nltk.data except ModuleNotFoundError: nltk = None + from pycaption.base import BaseWriter, CaptionNode diff --git a/pycaption/webvtt.py b/pycaption/webvtt.py index d40f02c3..805663f8 100644 --- a/pycaption/webvtt.py +++ b/pycaption/webvtt.py @@ -3,11 +3,11 @@ import sys from copy import deepcopy -from .base import ( - BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, -) +from .base import BaseReader, BaseWriter, Caption, CaptionList, CaptionNode, CaptionSet from .exceptions import ( - CaptionReadError, CaptionReadSyntaxError, CaptionReadNoCaptions, + CaptionReadError, + CaptionReadNoCaptions, + CaptionReadSyntaxError, InvalidInputError, ) from .geometry import HorizontalAlignmentEnum, Layout @@ -15,22 +15,22 @@ # A WebVTT timing line has both start/end times and layout related settings # (referred to as 'cue settings' in the documentation) # The following pattern captures [start], [end] and [cue settings] if existent -TIMING_LINE_PATTERN = re.compile(r'^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$') -TIMESTAMP_PATTERN = re.compile(r'^(\d+):(\d{2})(:\d{2})?\.(\d{3})') -VOICE_SPAN_PATTERN = re.compile(']*)>') +TIMING_LINE_PATTERN = re.compile(r"^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$") +TIMESTAMP_PATTERN = re.compile(r"^(\d+):(\d{2})(:\d{2})?\.(\d{3})") +VOICE_SPAN_PATTERN = re.compile("]*)>") OTHER_SPAN_PATTERN = re.compile( - r'' + r"" ) # These WebVTT tags are stripped off the cues on conversion WEBVTT_VERSION_OF = { - HorizontalAlignmentEnum.LEFT: 'left', - HorizontalAlignmentEnum.CENTER: 'center', - HorizontalAlignmentEnum.RIGHT: 'right', - HorizontalAlignmentEnum.START: 'start', - HorizontalAlignmentEnum.END: 'end' + HorizontalAlignmentEnum.LEFT: "left", + HorizontalAlignmentEnum.CENTER: "center", + HorizontalAlignmentEnum.RIGHT: "right", + HorizontalAlignmentEnum.START: "start", + HorizontalAlignmentEnum.END: "end", } -DEFAULT_ALIGN = 'start' +DEFAULT_ALIGN = "start" def microseconds(h, m, s, f): @@ -42,7 +42,9 @@ def microseconds(h, m, s, f): class 
WebVTTReader(BaseReader): - def __init__(self, ignore_timing_errors=True, time_shift_milliseconds=0, *args, **kwargs): + def __init__( + self, ignore_timing_errors=True, time_shift_milliseconds=0, *args, **kwargs + ): """ :param ignore_timing_errors: Whether to ignore timing checks :type ignore_timing_errors: bool @@ -53,11 +55,11 @@ def __init__(self, ignore_timing_errors=True, time_shift_milliseconds=0, *args, self.time_shift_microseconds = time_shift_milliseconds * 1000 def detect(self, content): - return 'WEBVTT' in content + return "WEBVTT" in content - def read(self, content, lang='en-US'): + def read(self, content, lang="en-US"): if not isinstance(content, str): - raise InvalidInputError('The content is not a unicode string.') + raise InvalidInputError("The content is not a unicode string.") caption_set = CaptionSet({lang: self._parse(content.splitlines())}) @@ -76,31 +78,30 @@ def _parse(self, lines): for i, line in enumerate(lines): - if '-->' in line: + if "-->" in line: found_timing = True timing_line = i last_start_time = captions[-1].start if captions else 0 try: start, end, layout_info = self._parse_timing_line( - line, last_start_time) + line, last_start_time + ) except CaptionReadError as e: - new_msg = f'{e.args[0]} (line {timing_line})' + new_msg = f"{e.args[0]} (line {timing_line})" tb = sys.exc_info()[2] raise type(e)(new_msg).with_traceback(tb) from None - elif '' == line: + elif "" == line: if found_timing and nodes: found_timing = False - caption = Caption( - start, end, nodes, layout_info=layout_info) + caption = Caption(start, end, nodes, layout_info=layout_info) captions.append(caption) nodes = [] else: if found_timing: if nodes: nodes.append(CaptionNode.create_break()) - nodes.append(CaptionNode.create_text( - self._decode(line))) + nodes.append(CaptionNode.create_text(self._decode(line))) else: # it's a comment or some metadata; ignore it pass @@ -113,21 +114,21 @@ def _parse(self, lines): return captions def _remove_styles(self, line): 
- partial_result = VOICE_SPAN_PATTERN.sub('\\2: ', line) - return OTHER_SPAN_PATTERN.sub('', partial_result) + partial_result = VOICE_SPAN_PATTERN.sub("\\2: ", line) + return OTHER_SPAN_PATTERN.sub("", partial_result) def _validate_timings(self, start, end, last_start_time): if start is None: - raise CaptionReadSyntaxError('Invalid cue start timestamp.') + raise CaptionReadSyntaxError("Invalid cue start timestamp.") if end is None: - raise CaptionReadSyntaxError('Invalid cue end timestamp.') + raise CaptionReadSyntaxError("Invalid cue end timestamp.") if start > end: - raise CaptionReadError( - 'End timestamp is not greater than start timestamp.') + raise CaptionReadError("End timestamp is not greater than start timestamp.") if start < last_start_time: raise CaptionReadError( - 'Start timestamp is not greater than or equal' - 'to start timestamp of previous cue.') + "Start timestamp is not greater than or equal" + "to start timestamp of previous cue." + ) def _parse_timing_line(self, line, last_start_time): """ @@ -135,7 +136,7 @@ def _parse_timing_line(self, line, last_start_time): """ m = TIMING_LINE_PATTERN.search(line) if not m: - raise CaptionReadSyntaxError('Invalid timing format.') + raise CaptionReadSyntaxError("Invalid timing format.") start = self._parse_timestamp(m.group(1)) + self.time_shift_microseconds end = self._parse_timestamp(m.group(2)) + self.time_shift_microseconds @@ -157,7 +158,7 @@ def _parse_timestamp(self, timestamp): """ m = TIMESTAMP_PATTERN.search(timestamp) if not m: - raise CaptionReadSyntaxError('Invalid timing format.') + raise CaptionReadSyntaxError("Invalid timing format.") m = m.groups() @@ -175,23 +176,23 @@ def _decode(self, s): """ s = s.strip() # Covert voice span - s = VOICE_SPAN_PATTERN.sub('\\2: ', s) + s = VOICE_SPAN_PATTERN.sub("\\2: ", s) # TODO: Add support for other WebVTT tags. For now just strip them # off the text. 
- s = OTHER_SPAN_PATTERN.sub('', s) + s = OTHER_SPAN_PATTERN.sub("", s) # Replace WebVTT special XML codes with plain unicode values - s = s.replace('<', '<') - s = s.replace('>', '>') - s = s.replace('‎', '\u200e') - s = s.replace('‏', '\u200f') - s = s.replace(' ', '\u00a0') + s = s.replace("<", "<") + s = s.replace(">", ">") + s = s.replace("‎", "\u200e") + s = s.replace("‏", "\u200f") + s = s.replace(" ", "\u00a0") # Must do ampersand last - s = s.replace('&', '&') + s = s.replace("&", "&") return s class WebVTTWriter(BaseWriter): - HEADER = 'WEBVTT\n\n' + HEADER = "WEBVTT\n\n" global_layout = None video_width = None video_height = None @@ -219,9 +220,9 @@ def write(self, caption_set, lang=None): captions = caption_set.get_captions(lang) - return output + '\n'.join( - [self._convert_caption(caption_set, caption) - for caption in captions]) + return output + "\n".join( + [self._convert_caption(caption_set, caption) for caption in captions] + ) def _timestamp(self, ts): td = datetime.timedelta(microseconds=ts) @@ -234,23 +235,23 @@ def _timestamp(self, ts): @staticmethod def _convert_style_to_text_tag(style): - if style == 'italics': - return ['', ''] - elif style == 'underline': - return ['', ''] - elif style == 'bold': - return ['', ''] + if style == "italics": + return ["", ""] + elif style == "underline": + return ["", ""] + elif style == "bold": + return ["", ""] else: - return ['', ''] + return ["", ""] def _calculate_resulting_style(self, style, caption_set): resulting_style = {} style_classes = [] - if 'classes' in style: - style_classes = style['classes'] - elif 'class' in style: - style_classes = [style['class']] + if "classes" in style: + style_classes = style["classes"] + elif "class" in style: + style_classes = [style["class"]] for style_class in style_classes: sub_style = caption_set.get_style(style_class).copy() @@ -271,11 +272,11 @@ def _convert_caption(self, caption_set, caption): start = self._timestamp(caption.start) end = 
self._timestamp(caption.end) - timespan = f'{start} --> {end}' + timespan = f"{start} --> {end}" - output = '' + output = "" - cue_style_tags = ['', ''] + cue_style_tags = ["", ""] # Text styling style = self._calculate_resulting_style(caption.style, caption_set) @@ -289,8 +290,8 @@ def _convert_caption(self, caption_set, caption): if not layout: layout = caption.layout_info or self.global_layout cue_settings = self._convert_positioning(layout) - output += timespan + cue_settings + '\n' - output += cue_style_tags[0] + cue_text + cue_style_tags[1] + '\n' + output += timespan + cue_settings + "\n" + output += cue_style_tags[0] + cue_text + cue_style_tags[1] + "\n" return output @@ -301,12 +302,12 @@ def _convert_positioning(self, layout): :rtype: str """ if not layout: - return '' + return "" # If it's converting from WebVTT to WebVTT, keep positioning info # unchanged if layout.webvtt_positioning: - return f' {layout.webvtt_positioning}' + return f" {layout.webvtt_positioning}" left_offset = None top_offset = None @@ -320,15 +321,14 @@ def _convert_positioning(self, layout): # There are absolute positioning values for this cue but the # Writer is explicitly configured not to do any relativization. # Ignore all positioning for this cue. - return '' + return "" # Ensure that all positioning values are measured using percentage. # This may raise an exception if layout.is_relative() == False # If you want to avoid it, you have to turn off relativization by # initializing this Writer with relativize=False. if not already_relative: - layout = layout.as_percentage_of( - self.video_width, self.video_height) + layout = layout.as_percentage_of(self.video_width, self.video_height) # Ensure that when there's a left offset the caption is not pushed out # of the screen. 
If the execution got this far it means origin and @@ -366,13 +366,13 @@ def _convert_positioning(self, layout): if layout.alignment: alignment = WEBVTT_VERSION_OF.get( - layout.alignment.horizontal, DEFAULT_ALIGN) + layout.alignment.horizontal, DEFAULT_ALIGN + ) else: alignment = DEFAULT_ALIGN - cue_settings = '' + cue_settings = "" - if alignment and \ - alignment != WEBVTT_VERSION_OF[HorizontalAlignmentEnum.CENTER]: + if alignment and alignment != WEBVTT_VERSION_OF[HorizontalAlignmentEnum.CENTER]: # Not sure why this condition was here, maybe because center # alignment is applied automatically without needing to specify it cue_settings += f" align:{alignment}" @@ -402,23 +402,22 @@ def _group_cues_by_layout(self, nodes, caption_set): layout_groups = [] # A properly encoded WebVTT string (plain unicode must be properly # escaped before being appended to this string) - s = '' - row, column, prev_row, prev_column = 0, 0, 0, 0 + s = "" for i, node in enumerate(nodes): if node.type_ == CaptionNode.TEXT: if s and current_layout and node.layout_info != current_layout: # If the positioning changes from one text node to # another, a new WebVTT cue has to be created. row, column = node.position if node.position else (0, 0) - prev_row, prev_column = current_node.position if current_node.position else (0, 0) - if row == prev_row + 1: - s += '\n' - else: + prev_row, prev_column = ( + current_node.position if current_node.position else (0, 0) + ) + if row != prev_row + 1: layout_groups.append((s, current_layout)) - s = '' + s = "" # ATTENTION: This is where the plain unicode node content is # finally encoded as WebVTT. 
- s += self._encode_illegal_characters(node.content) or ' ' + s += self._encode_illegal_characters(node.content) or " " current_layout = node.layout_info current_node = node elif node.type_ == CaptionNode.STYLE: @@ -426,7 +425,7 @@ def _group_cues_by_layout(self, nodes, caption_set): node.content, caption_set ) - styles = ['italics', 'underline', 'bold'] + styles = ["italics", "underline", "bold"] if not node.start: styles.reverse() @@ -442,10 +441,10 @@ def _group_cues_by_layout(self, nodes, caption_set): # "Style node" elif node.type_ == CaptionNode.BREAK: if i > 0 and nodes[i - 1].type_ != CaptionNode.TEXT: - s += ' ' + s += " " if i == 0: # cue text starts with a break - s += ' ' - s += '\n' + s += " " + s += "\n" if s: layout_groups.append((s, current_layout)) @@ -458,12 +457,12 @@ def _encode_illegal_characters(self, s): - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-text-span :type s: str """ - s = s.replace('&', '&') - s = s.replace('<', '<') + s = s.replace("&", "&") + s = s.replace("<", "<") # The substring "-->" is also not allowed according to this: # - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-block - s = s.replace('-->', '-->') + s = s.replace("-->", "-->") # The following characters have escaping codes for some reason, but # they're not illegal, so for now I'll leave this commented out so that diff --git a/setup.py b/setup.py index c1c2cf3e..9be9c5d6 100644 --- a/setup.py +++ b/setup.py @@ -1,62 +1,54 @@ #!/usr/bin/env python import os -from setuptools import setup, find_packages + +from setuptools import find_packages, setup README_PATH = os.path.join( os.path.abspath(os.path.dirname(__file__)), - 'README.rst', + "README.rst", ) dependencies = [ - 'beautifulsoup4>=4.12.1', - 'lxml>=4.9.1', - 'cssutils>=2.0.0', + "beautifulsoup4>=4.12.1", + "lxml>=4.9.1", + "cssutils>=2.0.0", ] -dev_dependencies = [ - 'pytest', - 'pytest-lazy-fixture' -] +dev_dependencies = ["pytest", "pytest-lazy-fixture"] -transcript_dependencies = [ - 'nltk==3.8.0' -] 
+transcript_dependencies = ["nltk==3.8.0"] setup( - name='pycaption', - version='2.2.12', - description='Closed caption converter', + name="pycaption", + version="2.2.12.dev8", + description="Closed caption converter", long_description=open(README_PATH).read(), - author='Joe Norton', - author_email='joey@nortoncrew.com', + author="Joe Norton", + author_email="joey@nortoncrew.com", project_urls={ - 'Source': 'https://github.com/pbs/pycaption', - 'Documentation': 'https://pycaption.readthedocs.io/', - 'Release notes': 'https://pycaption.readthedocs.io' - '/en/stable/changelog.html', + "Source": "https://github.com/pbs/pycaption", + "Documentation": "https://pycaption.readthedocs.io/", + "Release notes": "https://pycaption.readthedocs.io" "/en/stable/changelog.html", }, - python_requires='>=3.8,<4.0', + python_requires=">=3.8,<4.0", install_requires=dependencies, - extras_require={ - 'dev': dev_dependencies, - 'transcript': transcript_dependencies - }, + extras_require={"dev": dev_dependencies, "transcript": transcript_dependencies}, packages=find_packages(), include_package_data=True, classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Topic :: Software Development :: Libraries', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Multimedia :: Video', + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: 
Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Multimedia :: Video", ], test_suite="tests", ) diff --git a/tests/conftest.py b/tests/conftest.py index 74530ae8..acb97edb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -61,7 +61,20 @@ sample_scc_with_ampersand_character, sample_scc_multiple_formats, sample_scc_duplicate_tab_offset, sample_scc_duplicate_special_characters, sample_scc_tab_offset, sample_scc_with_unknown_commands, - sample_scc_special_and_extended_characters, + sample_scc_special_and_extended_characters, sample_scc_mid_row_before_text_pop, + sample_scc_mid_row_before_text_roll, sample_scc_mid_row_before_text_paint, + sample_scc_mid_row_following_text_no_text_before_italics_off_pop, + sample_scc_mid_row_following_text_no_text_before_italics_off_roll, + sample_scc_mid_row_following_text_no_text_before_italics_off_paint, + sample_scc_mid_row_following_text_no_text_before_italics_on_pop, + sample_scc_mid_row_following_text_no_text_before_italics_on_roll, + sample_scc_mid_row_following_text_no_text_before_italics_on_paint, + sample_scc_mid_row_with_space_before_pop, + sample_scc_mid_row_with_space_before_roll, + sample_scc_mid_row_with_space_before_paint, + sample_scc_with_spaces_at_eol_pop, + sample_scc_with_spaces_at_eol_roll, + sample_scc_with_spaces_at_eol_paint, ) from tests.fixtures.srt import ( # noqa: F401 sample_srt, sample_srt_ascii, sample_srt_numeric, sample_srt_empty, diff --git a/tests/fixtures/dfxp.py b/tests/fixtures/dfxp.py index 714e5d52..864f3e7a 100644 --- a/tests/fixtures/dfxp.py +++ b/tests/fixtures/dfxp.py @@ -729,6 +729,7 @@ def sample_dfxp_to_render_with_only_default_positioning_input(): """ + ## # When converting from DFXP to DFXP, notice the extra region "r0" is added, to # support the spam that 
sets the "tts:textAlign" attribute. @@ -996,28 +997,28 @@ def sample_dfxp_with_properly_closing_spans_output():
-

+

cccccc
c!c!

-

+

bbbb

-

- cccc
- bbaa
+

+ cccc
+ bbaa

-

+

aa

-

- bb
- cc
+

+ bb
+ cc

-

+

abcd

-

+

abcd

@@ -1525,4 +1526,4 @@ def sample_dfxp_default_styling_p_tags():

-""" \ No newline at end of file +""" diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py index a1b66892..e09a10d3 100644 --- a/tests/fixtures/scc.py +++ b/tests/fixtures/scc.py @@ -6,19 +6,19 @@ def sample_scc_created_dfxp_with_wrongly_closing_spans(): return """\ Scenarist_SCC V1.0 -00:01:28;09 9420 942f 94ae 9420 9452 8080 e3e3 e3e3 e3e3 9470 8080 e3a1 e3a1 +00:01:28;09 9420 94ae 9420 9452 8080 e3e3 e3e3 e3e3 9470 8080 e3a1 e3a1 942f 00:01:31;10 9420 942f 94ae -00:01:31;18 9420 9454 6262 6262 9458 8080 91ae e3e3 e3e3 9470 8080 6262 6161 +00:01:31;18 9420 9454 6262 6262 9458 8080 91ae e3e3 e3e3 9470 8080 6262 6161 942f 00:01:35;18 9420 942f 94ae 00:01:40;25 942c -00:01:51;18 9420 9452 8080 6161 94da 8080 91ae 6262 9470 8080 e3e3 +00:01:51;18 9420 9452 8080 6161 94da 8080 91ae 6262 9470 8080 e3e3 942f -00:01:55;22 9420 942f 6162 e364 94f4 8080 6162 e364 +00:01:55;22 9420 6162 e364 94f4 8080 6162 e364 942f 00:01:59;14 9420 942f 94ae """ @@ -81,7 +81,7 @@ def sample_scc_pop_on(): def sample_scc_multiple_positioning(): return """Scenarist_SCC V1.0 -00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 927c 927c e5e6 e5e6 942c 942c 942f 942f +00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 92fd 92fd e5e6 e5e6 942c 942c 942f 942f 00:00:02:16 94ae 94ae 9420 9420 16f2 16f2 6768 6768 9752 9752 e9ea e9ea 97f2 97f2 6bec 6bec 942c 942c 942f 942f @@ -485,3 +485,153 @@ def sample_scc_with_line_too_long(): 00:00:08;58 9420 9452 4920 ea75 73f4 20f7 616e f4e5 6420 ef6e e520 7368 eff7 2c80 94f2 ea75 73f4 20f4 ef20 6861 76e5 2061 7320 6120 ece9 f4f4 ece5 942c 8080 8080 942f """ + + +@pytest.fixture(scope="function") +def sample_scc_mid_row_before_text_pop(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9420 91d0 9120 c1c2 20c1 c280 942f + +""" + + +@pytest.fixture(scope="function") +def sample_scc_mid_row_before_text_roll(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9425 91d0 9120 c1c2 20c1 c280 + +""" + + 
+@pytest.fixture(scope="session") +def sample_scc_mid_row_before_text_paint(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9429 91d0 9120 c1c2 20c1 c280 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_off_pop(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9420 91ce 91ab 91ae c1c2 9120 c1c2 942f + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_off_roll(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9425 91ce 91ab 91ae c1c2 9120 c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_off_paint(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9429 91ce 91ab 91ae c1c2 9120 c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_on_pop(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9420 91d0 c1c2 91ae c1c2 942f + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_on_roll(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9425 91d0 c1c2 91ae c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_following_text_no_text_before_italics_on_paint(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9429 91d0 c1c2 91ae c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_with_space_before_pop(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9420 91d0 c180 c220 91ae c1c2 942f + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_with_space_before_roll(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9425 91d0 c180 c220 91ae c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_mid_row_with_space_before_paint(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9429 91d0 c180 c220 91ae c1c2 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_with_spaces_at_eol_pop(): + return """\ +Scenarist_SCC 
V1.0 + +00:00:01:24 9420 91d0 c180 c220 91e0 c1c2 2020 2080 92c2 c1c2 2080 942f + +""" + + +@pytest.fixture(scope="session") +def sample_scc_with_spaces_at_eol_roll(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9425 91d0 c180 c220 91e0 c1c2 2020 2080 92c2 c1c2 2080 + +""" + + +@pytest.fixture(scope="session") +def sample_scc_with_spaces_at_eol_paint(): + return """\ +Scenarist_SCC V1.0 + +00:00:01:24 9429 91d0 c180 c220 91e0 c1c2 2020 2080 92c2 c1c2 2080 + +""" diff --git a/tests/test_scc.py b/tests/test_scc.py index 3b78e138..b6076202 100644 --- a/tests/test_scc.py +++ b/tests/test_scc.py @@ -1,13 +1,12 @@ import pytest -from pycaption import SCCReader, CaptionReadNoCaptions, CaptionNode -from pycaption.exceptions import CaptionReadTimingError, CaptionLineLengthError -from pycaption.geometry import ( - UnitEnum, HorizontalAlignmentEnum, VerticalAlignmentEnum, -) +from pycaption import CaptionNode, CaptionReadNoCaptions, SCCReader +from pycaption.exceptions import CaptionLineLengthError, CaptionReadTimingError +from pycaption.geometry import HorizontalAlignmentEnum, UnitEnum, VerticalAlignmentEnum from pycaption.scc.constants import MICROSECONDS_PER_CODEWORD from pycaption.scc.specialized_collections import ( - InstructionNodeCreator, TimingCorrectingCaptionList, + InstructionNodeCreator, + TimingCorrectingCaptionList, ) from pycaption.scc.state_machines import DefaultProvidingPositionTracker from tests.mixins import ReaderTestingMixIn @@ -54,11 +53,12 @@ def test_proper_timestamps(self, sample_scc_pop_on): def test_invalid_timestamps(self, sample_scc_pop_on): with pytest.raises(CaptionReadTimingError) as exc_info: - SCCReader().read(sample_scc_pop_on.replace(':', '.')) + SCCReader().read(sample_scc_pop_on.replace(":", ".")) assert exc_info.value.args[0].startswith( "Timestamps should follow the hour:minute:seconds;frames or " "hour:minute:seconds:frames format. 
Please correct the following " - "time:") + "time:" + ) def test_empty_file(self, sample_scc_empty): with pytest.raises(CaptionReadNoCaptions): @@ -80,11 +80,12 @@ def test_positioning(self, sample_scc_multiple_positioning): ((20.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), ((70.0, UnitEnum.PERCENT), (11.0, UnitEnum.PERCENT)), ((40.0, UnitEnum.PERCENT), (41.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (71.0, UnitEnum.PERCENT)) + ((20.0, UnitEnum.PERCENT), (71.0, UnitEnum.PERCENT)), ] + actual_positioning = [ caption_.layout_info.origin.serialized() - for caption_ in captions.get_captions('en-US') + for caption_ in captions.get_captions("en-US") ] assert expected_positioning == actual_positioning @@ -99,12 +100,12 @@ def test_tab_offset(self, sample_scc_tab_offset): ((27.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), ((30.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)), ((35.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), - ((17.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)) + ((17.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), ] actual_positioning = [ caption_.layout_info.origin.serialized() - for caption_ in captions.get_captions('en-US') + for caption_ in captions.get_captions("en-US") ] assert expected_positioning == actual_positioning @@ -126,111 +127,130 @@ def switches_italics(node): return node.start caption_set = SCCReader().read(sample_scc_with_italics) - nodes = caption_set.get_captions('en-US')[0].nodes + nodes = caption_set.get_captions("en-US")[0].nodes # We assert that the text is specified in italics. 
# If Style nodes are replaced, the way these 3 assertions are made # will most likely change assert switches_italics(nodes[0]) is True assert switches_italics(nodes[2]) is False - assert nodes[1].content == 'abababab' + assert nodes[1].content == "abababab" def test_default_positioning_when_no_positioning_is_specified( - self, sample_no_positioning_at_all_scc): + self, sample_no_positioning_at_all_scc + ): caption_set = SCCReader().read(sample_no_positioning_at_all_scc) actual_caption_layouts = [ caption.layout_info.serialized() - for caption in caption_set.get_captions('en-US') + for caption in caption_set.get_captions("en-US") ] expected_caption_layouts = [ - (((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), - None, None, - (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP)), - (((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), - None, None, - (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP)) + ( + ((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), + None, + None, + (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP), + ), + ( + ((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), + None, + None, + (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP), + ), ] actual_node_layout_infos = [ {idx: [node.layout_info.serialized() for node in caption.nodes]} - for idx, caption in enumerate(caption_set.get_captions('en-US')) + for idx, caption in enumerate(caption_set.get_captions("en-US")) ] expected_node_layout_infos = [ - {0: [( - ((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), - None, None, - (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP) - )]}, - {1: [( - ((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), - None, None, - (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP) - )]} + { + 0: [ + ( + ((10.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), + None, + None, + (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP), + ) + ] + }, + { + 1: [ + ( + ((10.0, UnitEnum.PERCENT), (83.0, 
UnitEnum.PERCENT)), + None, + None, + (HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP), + ) + ] + }, ] assert expected_node_layout_infos == actual_node_layout_infos assert expected_caption_layouts == actual_caption_layouts def test_timing_is_properly_set_on_split_captions( - self, sample_scc_produces_captions_with_start_and_end_time_the_same + self, sample_scc_produces_captions_with_start_and_end_time_the_same ): caption_set = SCCReader().read( sample_scc_produces_captions_with_start_and_end_time_the_same ) expected_timings = [ - ('00:01:35.633', '00:01:40.833'), - ('00:01:35.633', '00:01:40.833'), - ('00:01:35.633', '00:01:40.833'), + ("00:01:35.633", "00:01:40.833"), + ("00:01:35.633", "00:01:40.833"), + ("00:01:35.633", "00:01:40.833"), ] actual_timings = [ (c_.format_start(), c_.format_end()) - for c_ in caption_set.get_captions('en-US') + for c_ in caption_set.get_captions("en-US") ] assert expected_timings == actual_timings def test_skip_extended_characters_ascii_duplicate( - self, sample_scc_with_extended_characters): + self, sample_scc_with_extended_characters + ): caption_set = SCCReader().read(sample_scc_with_extended_characters) - captions = caption_set.get_captions('en-US') - assert captions[0].nodes[0].content == 'MÄRTHA:' - expected_result = ['JUNIOR: ¡Yum!', None, 'Ya me siento mucho mejor.'] + captions = caption_set.get_captions("en-US") + assert captions[0].nodes[0].content == "MÄRTHA:" + expected_result = ["JUNIOR: ¡Yum!", None, "Ya me siento mucho mejor."] content = [node.content for node in captions[1].nodes] assert all(result in expected_result for result in content) def test_skip_duplicate_tab_offset(self, sample_scc_duplicate_tab_offset): expected_lines = [ - '[Radio reporter]', - 'The I-10 Santa Monica Freeway', - 'westbound is jammed,', - 'due to a three-car accident', - 'blocking lanes 1 and 2' + "[Radio reporter]", + "The I-10 Santa Monica Freeway", + "westbound is jammed,", + "due to a three-car accident", + "blocking lanes 1 
and 2", ] caption_set = SCCReader().read(sample_scc_duplicate_tab_offset) actual_lines = [ node.content - for cap_ in caption_set.get_captions('en-US') + for cap_ in caption_set.get_captions("en-US") for node in cap_.nodes if node.type_ == CaptionNode.TEXT ] assert expected_lines == actual_lines def test_skip_duplicate_special_characters( - self, sample_scc_duplicate_special_characters): + self, sample_scc_duplicate_special_characters + ): expected_lines = [ - '®°½¿™¢£♪à èâêîôû', - '®°½¿™¢£♪à èâêîôû', - '®°AA½¿™¢£♪à èâêAAîôû' + "®°½¿™¢£♪à èâêîôû", + "®°½¿™¢£♪à èâêîôû", + "®°AA½¿™¢£♪à èâêAAîôû", ] caption_set = SCCReader().read(sample_scc_duplicate_special_characters) actual_lines = [ node.content - for cap_ in caption_set.get_captions('en-US') + for cap_ in caption_set.get_captions("en-US") for node in cap_.nodes if node.type_ == CaptionNode.TEXT ] @@ -241,53 +261,174 @@ def test_flashing_cue(self, sample_scc_flashing_cue): SCCReader().read(sample_scc_flashing_cue) assert exc_info.value.args[0].startswith( - "Unsupported cue duration around 00:00:20.433") + "Unsupported cue duration around 00:00:20.433" + ) def test_line_too_long(self, sample_scc_with_line_too_long): with pytest.raises(CaptionLineLengthError) as exc_info: SCCReader().read(sample_scc_with_line_too_long) assert exc_info.value.args[0].startswith( - "32 character limit for caption cue in scc file.") - str_to_check = ("around 00:00:05.900 - was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l " - "Denison, a friend - Length 81") - assert str_to_check in exc_info.value.args[0].split("\n") + "32 character limit for caption cue in scc file." 
+ ) + str_to_check = ( + "was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l " + "Denison, a friend - Length 81" + ) + assert "around 00:00:05.900" in exc_info.value.args[0].split("\n")[2] + assert str_to_check in exc_info.value.args[0].split("\n")[2] + + def test_mid_row_codes_not_adding_space_before_text( + self, + sample_scc_mid_row_before_text_pop, + sample_scc_mid_row_before_text_roll, + sample_scc_mid_row_before_text_paint, + ): + # if mid-row code is before any text in the cue, no space + # should be added + expected_lines = ["AB AB"] # no space before first A + for caption in [ + sample_scc_mid_row_before_text_pop, + sample_scc_mid_row_before_text_roll, + sample_scc_mid_row_before_text_paint, + ]: + caption_set = SCCReader().read(caption) + actual_lines = [ + node.content + for cap_ in caption_set.get_captions("en-US") + for node in cap_.nodes + if node.type_ == CaptionNode.TEXT + ] + assert expected_lines == actual_lines + + def test_mid_row_codes_adding_space_after_text_if_there_is_none_closing_style( + self, + sample_scc_mid_row_following_text_no_text_before_italics_off_pop, + sample_scc_mid_row_following_text_no_text_before_italics_off_roll, + sample_scc_mid_row_following_text_no_text_before_italics_off_paint, + ): + # if there's no space in between text nodes it should add one + # since 9120 is closing italics, the space will be added before + # second text node + expected_lines = ["AB", " AB"] + for caption in [ + sample_scc_mid_row_following_text_no_text_before_italics_off_pop, + sample_scc_mid_row_following_text_no_text_before_italics_off_roll, + sample_scc_mid_row_following_text_no_text_before_italics_off_paint, + ]: + caption_set = SCCReader().read(caption) + actual_lines = [ + node.content + for cap_ in caption_set.get_captions("en-US") + for node in cap_.nodes + if node.type_ == CaptionNode.TEXT + ] + assert expected_lines == actual_lines + + def test_mid_row_codes_adding_space_after_text_if_there_is_none_opening_style( + self, + 
sample_scc_mid_row_following_text_no_text_before_italics_on_pop, + sample_scc_mid_row_following_text_no_text_before_italics_on_roll, + sample_scc_mid_row_following_text_no_text_before_italics_on_paint, + ): + # if there's no space in between text nodes it should add one + # since 91ae is opening italics, the space will be added at the end + # of the first text node + expected_lines = ["AB ", "AB"] + for caption in [ + sample_scc_mid_row_following_text_no_text_before_italics_on_pop, + sample_scc_mid_row_following_text_no_text_before_italics_on_roll, + sample_scc_mid_row_following_text_no_text_before_italics_on_paint, + ]: + caption_set = SCCReader().read(caption) + actual_lines = [ + node.content + for cap_ in caption_set.get_captions("en-US") + for node in cap_.nodes + if node.type_ == CaptionNode.TEXT + ] + assert expected_lines == actual_lines + + def test_mid_row_codes_not_adding_space_if_there_is_one_before( + self, + sample_scc_mid_row_with_space_before_pop, + sample_scc_mid_row_with_space_before_roll, + sample_scc_mid_row_with_space_before_paint, + ): + # if mid-row code following a text node that ends in space + # no additional space will be added + expected_lines = ["AB ", "AB"] + for caption in [ + sample_scc_mid_row_with_space_before_pop, + sample_scc_mid_row_with_space_before_roll, + sample_scc_mid_row_with_space_before_paint, + ]: + # no additional space added (will not be 'AB ') + caption_set = SCCReader().read(caption) + actual_lines = [ + node.content + for cap_ in caption_set.get_captions("en-US") + for node in cap_.nodes + if node.type_ == CaptionNode.TEXT + ] + assert expected_lines == actual_lines + + def test_removing_spaces_at_end_of_lines( + self, + sample_scc_with_spaces_at_eol_pop, + sample_scc_with_spaces_at_eol_roll, + sample_scc_with_spaces_at_eol_paint, + ): + expected_lines = ["AB", "AB", "AB"] + for caption in [ + sample_scc_with_spaces_at_eol_pop, + sample_scc_with_spaces_at_eol_roll, + sample_scc_with_spaces_at_eol_paint, + ]: + 
caption_set = SCCReader().read(caption) + actual_lines = [ + node.content + for cap_ in caption_set.get_captions("en-US") + for node in cap_.nodes + if node.type_ == CaptionNode.TEXT + ] + assert expected_lines == actual_lines class TestCoverageOnly: """In order to refactor safely, we need coverage of 95% or more. - This class includes tests that ensure that at the very least, we don't - break anything that was working, OR fix anything whose faulty behavior - was accepted. + This class includes tests that ensure that at the very least, we don't + break anything that was working, OR fix anything whose faulty behavior + was accepted. - All the tests in this suite should only be useful for refactoring. They - DO NOT ensure functionality. They only ensure nothing changes. + All the tests in this suite should only be useful for refactoring. They + DO NOT ensure functionality. They only ensure nothing changes. """ def test_freeze_rollup_captions_contents(self, sample_scc_roll_up_ru2): # There were no tests for ROLL-UP captions, but the library processed # Roll-Up captions. 
Make sure nothing changes during the refactoring scc1 = SCCReader().read(sample_scc_roll_up_ru2) - captions = scc1.get_captions('en-US') + captions = scc1.get_captions("en-US") actual_texts = [cap_.nodes[0].content for cap_ in captions] expected_texts = [ - '>>> HI.', + ">>> HI.", "I'M KEVIN CUNNING AND AT", "INVESTOR'S BANK WE BELIEVE IN", - 'HELPING THE LOCAL NEIGHBORHOODS', - 'AND IMPROVING THE LIVES OF ALL', - 'WE SERVE.', - '®°½', - '®°½½', - 'ABû', - 'ÁÉÓ¡', + "HELPING THE LOCAL NEIGHBORHOODS", + "AND IMPROVING THE LIVES OF ALL", + "WE SERVE.", + "®°½", + "®°½½", + "ABû", + "ÁÉÓ¡", "WHERE YOU'RE STANDING NOW,", "LOOKING OUT THERE, THAT'S AL", - 'THE CROWD.', - '>> IT WAS GOOD TO BE IN TH', + "THE CROWD.", + ">> IT WAS GOOD TO BE IN TH", "And restore Iowa's land, water", - 'And wildlife.', - '>> Bike Iowa, your source for' + "And wildlife.", + ">> Bike Iowa, your source for", ] assert expected_texts == actual_texts @@ -296,24 +437,23 @@ def test_multiple_formats(self, sample_scc_multiple_formats): # ensure the paint on lines are not repeated expected_text_lines = [ "(Client's Voice)", - 'Remember that degree', - 'you got in taxation?', - '(Danny)', + "Remember that degree", + "you got in taxation?", + "(Danny)", "Of course you don't", "because you didn't!", "Your job isn't doing hard", - 'work...', + "work...", "...it's making them do hard", - 'work...', - '...and getting paid for it.', - '(VO)', - 'Snap and sort your expenses to', - 'save over $4,600 at tax time.', - 'QUICKBOOKS. BACKING YOU.', + "work...", + "...and getting paid for it.", + "(VO)", + "Snap and sort your expenses to", + "save over $4,600 at tax time.", + "QUICKBOOKS. 
BACKING YOU.", ] - captions = SCCReader().read(sample_scc_multiple_formats) \ - .get_captions('en-US') + captions = SCCReader().read(sample_scc_multiple_formats).get_captions("en-US") text_lines = [ node.content for caption in captions @@ -325,7 +465,7 @@ def test_multiple_formats(self, sample_scc_multiple_formats): def test_freeze_semicolon_spec_time(self, sample_scc_roll_up_ru3): scc1 = SCCReader().read(sample_scc_roll_up_ru3) - captions = scc1.get_captions('en-US') + captions = scc1.get_captions("en-US") expected_timings = [ (733333.3333333333, 2766666.6666666665), (2766666.6666666665, 4566666.666666666), @@ -363,8 +503,7 @@ def test_freeze_colon_spec_time(self, sample_scc_pop_on): (32132100.000000004, 36169466.666666664), ] - actual_timings = [ - (c_.start, c_.end) for c_ in scc1.get_captions('en-US')] + actual_timings = [(c_.start, c_.end) for c_ in scc1.get_captions("en-US")] assert expected_timings == actual_timings @@ -385,72 +524,320 @@ def test_italics_commands_are_formatted_properly(self): # 9120 and 91ae are mid row codes and will add a space # 9120 at the start of the following text node # 91ae to the end of the previous text node - node_creator.interpret_command('9470') # row 15, col 0 - node_creator.interpret_command('9120') # italics off - node_creator.interpret_command('9120') # italics off - node_creator.add_chars('a') - - node_creator.interpret_command('9770') # row 10 col 0 - node_creator.interpret_command('91ae') # italics ON - node_creator.add_chars('b') - node_creator.interpret_command('91ae') # italics ON - node_creator.interpret_command('91ae') # italics ON - node_creator.interpret_command('9120') # italics OFF - node_creator.interpret_command('9120') # italics OFF - node_creator.interpret_command('91ae') # italics ON - node_creator.interpret_command('91ae') # italics ON - node_creator.interpret_command('91ae') # italics ON - node_creator.add_chars('b') - node_creator.interpret_command('91ae') # italics ON again - 
node_creator.add_chars('b') - node_creator.interpret_command('9120') # italics OFF adds space - node_creator.interpret_command('9120') # italics OFF - - node_creator.interpret_command('1570') # row 6 col 0 - node_creator.add_chars('c') - node_creator.interpret_command('91ae') # italics ON - - node_creator.interpret_command('9270') # row 4 col 0 - node_creator.add_chars('d') - - node_creator.interpret_command('15d0') # row 5 col 0 - creates BR - node_creator.add_chars('e') - - node_creator.interpret_command('1570') # row 6 col 0 - creates BR - node_creator.add_chars('f') + node_creator.interpret_command("9470") # 0 - row 15, col 0 + node_creator.interpret_command("9120") # italics off + node_creator.interpret_command("9120") # italics off + node_creator.add_chars("a") + + node_creator.interpret_command("9770") # row 10 col 0 + node_creator.interpret_command("91ae") # italics ON plain + node_creator.add_chars("b") + node_creator.interpret_command("91ae") # italics ON + node_creator.interpret_command("91ae") # italics ON + node_creator.interpret_command("9120") # italics OFF + node_creator.interpret_command("9120") # italics OFF + node_creator.interpret_command("91ae") # italics ON + node_creator.interpret_command("91ae") # italics ON + node_creator.interpret_command("91ae") # italics ON + node_creator.add_chars("b") + node_creator.interpret_command("91ae") # italics ON again + node_creator.add_chars("b") + node_creator.interpret_command("9120") # italics OFF adds space + node_creator.interpret_command("9120") # italics OFF + + node_creator.interpret_command("1570") # row 6 col 0 + node_creator.add_chars("c") + node_creator.interpret_command("91ae") # italics ON + + node_creator.interpret_command("9270") # row 4 col 0 + node_creator.add_chars("d") + + node_creator.interpret_command("15d0") # row 5 col 0 - creates BR + node_creator.add_chars("e") + + node_creator.interpret_command("1570") # row 6 col 0 - creates BR + node_creator.add_chars("f") result = 
list(node_creator) assert result[0].is_text_node() - assert result[1].is_text_node() - assert result[2].requires_repositioning() + assert result[1].requires_repositioning() + assert result[2].sets_italics_on() - assert result[3].is_italics_node() - assert result[3].sets_italics_on() + assert result[3].is_text_node() assert result[4].is_text_node() assert result[5].sets_italics_off() assert result[6].is_text_node() - assert result[7].is_text_node() - assert result[8].sets_italics_on() + assert result[7].requires_repositioning() + assert result[8].is_text_node() - assert result[9].is_text_node() + assert result[9].requires_repositioning() assert result[10].is_text_node() - assert result[11].sets_italics_off() + assert result[11].is_explicit_break() assert result[12].is_text_node() - assert result[13].is_text_node() - assert result[14].requires_repositioning() - assert result[15].is_text_node() + assert result[13].is_explicit_break() + assert result[14].is_text_node() + + @staticmethod + def check_closing_italics_closing_on_style_change(node_creator): + node_creator.interpret_command("9140") # row 01, col 0 plaintext + node_creator.interpret_command("9120") # plaintext = italics off + node_creator.interpret_command("9120") # plaintext = italics off + assert ( + len(node_creator._collection) == 0 + ) # will get cleaned by format_italics + node_creator.interpret_command("91ce") # row 01, col 0 italics + assert len(node_creator._collection) == 1 + assert node_creator._collection[0].sets_italics_on() + node_creator.add_chars("a", "b") # write ab + assert node_creator.last_style == "italics on" + assert node_creator._collection[-1].is_text_node() + assert node_creator._collection[-1].text == "ab" + node_creator.interpret_command("91ab") + # mid-row for Yellow Underline, it should close the italics and + # add a space after text, so the last text node will contain a space + assert node_creator.last_style == "italics off" + assert node_creator._collection[-1].text == " " + 
node_creator.interpret_command("91ae") # italics again + # it should re-open italics + node_creator.add_chars("c", "d") + assert node_creator.last_style == "italics on" + assert node_creator._collection[-1].text == "cd" + # let's break the line now, and keep the style + node_creator.interpret_command("916e") # row 02, column 00 with italics + node_creator.add_chars("e", "f") + assert node_creator.last_style == "italics on" + assert node_creator._collection[-1].text == "ef" + # let's break the line now, but change the style this time + # it should close italics then break + node_creator.interpret_command("9252") # row 03, column 04 with plaintext + node_creator.add_chars("g", "h") + assert node_creator.last_style == "italics off" + assert node_creator._collection[-1].text == "gh" + # check that we have a closed italic before break + assert node_creator._collection[-2].is_explicit_break() + assert node_creator._collection[-3].sets_italics_off() + # open italics again with a mid-row code + # should open italics and add a space after text: "gh" -> "gh " + node_creator.interpret_command("91ae") + assert node_creator.last_style == "italics on" + assert node_creator._collection[-2].text == "gh " + # send another italics command which should be ignored since there is + # already an open italics tag + collection_length = len(node_creator._collection) + node_creator.interpret_command("91ae") + assert node_creator.last_style == "italics on" + assert node_creator._collection[-2].text == "gh " + # no additional node is added + assert len(node_creator._collection) == collection_length + + def test_closing_italics_closing_on_style_change(self): + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + # pop-up + node_creator.interpret_command("9420") + self.check_closing_italics_closing_on_style_change(node_creator) + # roll + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + 
node_creator.interpret_command("9425") + self.check_closing_italics_closing_on_style_change(node_creator) + # paint-on + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + node_creator.interpret_command("9429") + self.check_closing_italics_closing_on_style_change(node_creator) + + def test_remove_noon_off_on_italics(self): + from pycaption.scc.specialized_collections import ( + _InstructionNode, + _remove_noon_off_on_italics, + ) + + position_tracker = DefaultProvidingPositionTracker().default + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + node_creator.interpret_command("9420") + node_creator.interpret_command("9140") # row 01, col 0 plaintext + node_creator.add_chars("a", "b") + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + # last two nodes should be + assert node_creator._collection[-2].sets_italics_off() + assert node_creator._collection[-1].sets_italics_on() + + new_collection = _remove_noon_off_on_italics(node_creator._collection) + + # should eliminate italic tags, keep only the text node + assert len(new_collection) == 1 + assert new_collection[0].is_text_node() + assert new_collection[0].text == "ab" + + # check if there's text in between close/open italics + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator.add_chars("c", "d") + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + + assert node_creator._collection[-3].sets_italics_off() + assert node_creator._collection[-2].is_text_node() + assert node_creator._collection[-1].sets_italics_on() + + new_collection = _remove_noon_off_on_italics(node_creator._collection) + # should not eliminate any node + 
assert new_collection[-3].sets_italics_off() + assert new_collection[-2].is_text_node() + assert new_collection[-1].sets_italics_on() + + def test_skip_redundant_italics_nodes(self): + from pycaption.scc.specialized_collections import ( + _InstructionNode, + _skip_redundant_italics_nodes, + ) - assert result[16].requires_repositioning() - assert result[17].sets_italics_on() - assert result[18].is_text_node() - assert result[19].is_explicit_break() - assert result[20].is_text_node() - assert result[21].is_explicit_break() - assert result[22].is_text_node() - assert result[23].sets_italics_off() + position_tracker = DefaultProvidingPositionTracker().default + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + new_collection = _skip_redundant_italics_nodes(node_creator._collection) + # should remove italics off + assert len(new_collection) == 1 + assert new_collection[0].sets_italics_on() + + # test with text inbetween + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator.add_chars("f", "o", "o") + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + new_collection = _skip_redundant_italics_nodes(node_creator._collection) + # should remove italics off + assert len(new_collection) == 2 + assert new_collection[-1].sets_italics_on() + assert new_collection[-2].text == "foo" + + # test with same style + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + node_creator._collection.append( + 
_InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + node_creator.add_chars("f", "o", "o") + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=False) + ) + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + new_collection = _skip_redundant_italics_nodes(node_creator._collection) + # should open italics once, write foo, close italics once then re-open italics + assert len(new_collection) == 4 + assert new_collection[-1].sets_italics_on() + assert new_collection[-2].sets_italics_off() + assert new_collection[-3].text == "foo" + assert new_collection[-4].sets_italics_on() + + def test_close_italics_before_repositioning(self): + from pycaption.scc.specialized_collections import ( + _close_italics_before_repositioning, + _InstructionNode, + ) + + position_tracker = DefaultProvidingPositionTracker().default + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + + # set italics + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + node_creator.add_chars("f", "o", "o") + # reposition + node_creator._collection.append( + _InstructionNode.create_repositioning_command(position_tracker) + ) + node_creator.add_chars("b", "a", "r") + + new_collection = _close_italics_before_repositioning(node_creator._collection) + assert new_collection[0].sets_italics_on() + assert new_collection[1].text == "foo" + assert new_collection[2].sets_italics_off() + assert new_collection[3].requires_repositioning() + assert 
new_collection[4].sets_italics_on() + assert new_collection[5].text == "bar" + + def test_ensure_final_italics_node_closes(self): + from pycaption.scc.specialized_collections import ( + _ensure_final_italics_node_closes, + _InstructionNode, + ) + + position_tracker = DefaultProvidingPositionTracker().default + node_creator = InstructionNodeCreator( + position_tracker=(DefaultProvidingPositionTracker()) + ) + + # set italics + node_creator._collection.append( + _InstructionNode.create_italics_style(position_tracker, turn_on=True) + ) + node_creator.add_chars("f", "o", "o") + node_creator._collection.append( + _InstructionNode.create_repositioning_command(position_tracker) + ) + new_collection = _ensure_final_italics_node_closes(node_creator._collection) + + # it should close italics at the end + assert new_collection[-1].sets_italics_off() + + # let's add some breaks + node_creator._collection.append( + _InstructionNode.create_break(position=position_tracker) + ) + node_creator.add_chars("b", "a", "r") + node_creator._collection.append( + _InstructionNode.create_break(position=position_tracker) + ) + node_creator.add_chars("b", "a", "z") + new_collection = _ensure_final_italics_node_closes(node_creator._collection) + # it should close italics at the end + assert new_collection[-1].sets_italics_off() class CaptionDummy: @@ -462,7 +849,7 @@ def __init__(self, start=0, end=0, nodes=(1, 2)): self.end = end def __repr__(self): - return f'{self.start}-->{self.end}' + return f"{self.start}-->{self.end}" class TestTimingCorrectingCaptionList: @@ -554,15 +941,13 @@ def test_overwriting_end_time_difference_under_5_frames(self): expected_end_2 = 6 * second + 4 * MICROSECONDS_PER_CODEWORD - caption_list.extend([CaptionDummy(start=expected_end_2, - end=8 * second)]) + caption_list.extend([CaptionDummy(start=expected_end_2, end=8 * second)]) # Append then extend assert caption_list[-2].end == expected_end_2 expected_end_3 = 8 * second + 3 * MICROSECONDS_PER_CODEWORD - 
caption_list.extend([CaptionDummy(start=expected_end_3, - end=10 * second)]) + caption_list.extend([CaptionDummy(start=expected_end_3, end=10 * second)]) # Extend then extend assert caption_list[-2].end == expected_end_3 @@ -573,12 +958,11 @@ def test_overwriting_end_time_difference_under_5_frames(self): assert caption_list[-2].end == expected_end_4 def test_last_caption_zero_end_time_is_corrected( - self, sample_scc_no_explicit_end_to_last_caption): - caption_set = SCCReader().read( - sample_scc_no_explicit_end_to_last_caption - ) + self, sample_scc_no_explicit_end_to_last_caption + ): + caption_set = SCCReader().read(sample_scc_no_explicit_end_to_last_caption) - last_caption = caption_set.get_captions('en-US')[-1] + last_caption = caption_set.get_captions("en-US")[-1] assert last_caption.end == last_caption.start + 4 * 1000 * 1000 @@ -588,6 +972,6 @@ def test_eoc_first_command(self, sample_scc_eoc_first_command): caption_set = SCCReader().read(sample_scc_eoc_first_command) # just one caption, first EOC disappears - num_captions = len(caption_set.get_captions('en-US')) + num_captions = len(caption_set.get_captions("en-US")) assert num_captions == 2 diff --git a/tests/test_scc_conversion.py b/tests/test_scc_conversion.py index 67dc1fb9..ed7a81c9 100644 --- a/tests/test_scc_conversion.py +++ b/tests/test_scc_conversion.py @@ -1,9 +1,13 @@ import pytest from pycaption import ( - SCCReader, SCCWriter, SRTReader, SRTWriter, DFXPWriter, WebVTTWriter, + DFXPWriter, + SCCReader, + SCCWriter, + SRTReader, + SRTWriter, + WebVTTWriter, ) - from tests.mixins import CaptionSetTestingMixIn # This is quite fuzzy at the moment. 
@@ -31,17 +35,18 @@ def test_srt_to_scc_to_srt_conversion(self, sample_srt_ascii): class TestSCCtoDFXP: - def test_scc_to_dfxp(self, sample_dfxp_from_scc_output, - sample_scc_multiple_positioning): + def test_scc_to_dfxp( + self, sample_dfxp_from_scc_output, sample_scc_multiple_positioning + ): caption_set = SCCReader().read(sample_scc_multiple_positioning) - dfxp = DFXPWriter( - relativize=False, fit_to_screen=False).write(caption_set) - + dfxp = DFXPWriter(relativize=False, fit_to_screen=False).write(caption_set) assert sample_dfxp_from_scc_output == dfxp def test_dfxp_is_valid_xml_when_scc_source_has_weird_italic_commands( - self, sample_dfxp_with_properly_closing_spans_output, - sample_scc_created_dfxp_with_wrongly_closing_spans): + self, + sample_dfxp_with_properly_closing_spans_output, + sample_scc_created_dfxp_with_wrongly_closing_spans, + ): caption_set = SCCReader().read( sample_scc_created_dfxp_with_wrongly_closing_spans ) @@ -51,8 +56,8 @@ def test_dfxp_is_valid_xml_when_scc_source_has_weird_italic_commands( assert dfxp == sample_dfxp_with_properly_closing_spans_output def test_dfxp_is_valid_xml_when_scc_source_has_ampersand_character( - self, sample_dfxp_with_ampersand_character, - sample_scc_with_ampersand_character): + self, sample_dfxp_with_ampersand_character, sample_scc_with_ampersand_character + ): caption_set = SCCReader().read(sample_scc_with_ampersand_character) dfxp = DFXPWriter().write(caption_set) @@ -62,10 +67,11 @@ def test_dfxp_is_valid_xml_when_scc_source_has_ampersand_character( class TestSCCToWebVTT: def test_webvtt_newlines_are_properly_rendered( - self, sample_webvtt_from_scc_properly_writes_newlines_output, - scc_that_generates_webvtt_with_proper_newlines): - caption_set = SCCReader().read( - scc_that_generates_webvtt_with_proper_newlines) + self, + sample_webvtt_from_scc_properly_writes_newlines_output, + scc_that_generates_webvtt_with_proper_newlines, + ): + caption_set = 
SCCReader().read(scc_that_generates_webvtt_with_proper_newlines) webvtt = WebVTTWriter().write(caption_set) assert webvtt == sample_webvtt_from_scc_properly_writes_newlines_output