From c6b0ea6c5fd27f83a9bd63359752cc155e651b16 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 27 Jun 2024 12:44:56 +0100 Subject: [PATCH] Add --ignore-multiline-regex option. Allows exclusion of multiline regions. Matching regions are replaced with blank lines. --- codespell_lib/_codespell.py | 62 +++++++++++++++++++++++++++++-- codespell_lib/tests/test_basic.py | 37 ++++++++++++++++++ pyproject.toml | 4 +- 3 files changed, 97 insertions(+), 6 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index f5070dc2d9..0ff66c822f 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -36,6 +36,7 @@ Pattern, Sequence, Set, + TextIO, Tuple, ) @@ -201,11 +202,17 @@ def __str__(self) -> str: class FileOpener: - def __init__(self, use_chardet: bool, quiet_level: int) -> None: + def __init__( + self, + use_chardet: bool, + quiet_level: int, + ignore_multiline_regex: Optional[Pattern[str]], + ) -> None: self.use_chardet = use_chardet if use_chardet: self.init_chardet() self.quiet_level = quiet_level + self.ignore_multiline_regex = ignore_multiline_regex def init_chardet(self) -> None: try: @@ -247,7 +254,7 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: ) raise else: - lines = f.readlines() + lines = self.get_lines(f) f.close() return lines, f.encoding @@ -262,7 +269,7 @@ def open_with_internal(self, filename: str) -> Tuple[List[str], str]: print(f'WARNING: Trying next encoding "{encoding}"', file=sys.stderr) with open(filename, encoding=encoding, newline="") as f: try: - lines = f.readlines() + lines = self.get_lines(f) except UnicodeDecodeError: if not self.quiet_level & QuietLevels.ENCODING: print( @@ -279,6 +286,22 @@ def open_with_internal(self, filename: str) -> Tuple[List[str], str]: return lines, encoding + def get_lines(self, f: TextIO) -> List[str]: + if self.ignore_multiline_regex: + text = f.read() + pos = 0 + text2 = "" + for m in re.finditer(self.ignore_multiline_regex, text): + text2 += text[pos : m.start()] + # Replace with blank lines so line numbers are unchanged. + text2 += "\n" * m.group().count("\n") + pos = m.end() + text2 += text[pos:] + lines = text2.split("\n") + else: + lines = f.readlines() + return lines + # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:- @@ -411,6 +434,19 @@ def parse_options( 'e.g., "\\bmatch\\b". Defaults to ' "empty/disabled.", ) + parser.add_argument( + "--ignore-multiline-regex", + action="store", + type=str, + help="regular expression that is used to ignore " + "text that may span multi-line regions. " + "The regex is run with re.DOTALL. For example to " + "allow skipping of regions of Python code using " + "begin/end comments one could use: " + "--ignore-multiline-regex " + "'# codespell:ignore-begin *\\n.*# codespell:ignore-end *\\n'. " + "Defaults to empty/disabled.", + ) parser.add_argument( "-I", "--ignore-words", @@ -1115,6 +1151,20 @@ def main(*args: str) -> int: else: ignore_word_regex = None + if options.ignore_multiline_regex: + try: + ignore_multiline_regex = re.compile( + options.ignore_multiline_regex, re.DOTALL + ) + except re.error as e: + return _usage_error( + parser, + f"ERROR: invalid --ignore-multiline-regex " + f'"{options.ignore_multiline_regex}" ({e})', + ) + else: + ignore_multiline_regex = None + ignore_words, ignore_words_cased = parse_ignore_words_option( options.ignore_words_list ) @@ -1203,7 +1253,11 @@ def main(*args: str) -> int: for exclude_file in exclude_files: build_exclude_hashes(exclude_file, exclude_lines) - file_opener = FileOpener(options.hard_encoding_detection, options.quiet_level) + file_opener = FileOpener( + options.hard_encoding_detection, + options.quiet_level, + ignore_multiline_regex, + ) glob_match = GlobMatch( flatten_clean_comma_separated_arguments(options.skip) if options.skip else [] diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 98e5dd41f0..e0e71ec8e2 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -942,6 +942,43 @@ def test_ignore_regex_option( assert cs.main(fname, r"--ignore-regex=\bdonn\b") == 1 +def test_ignore_multiline_regex_option( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + """Test ignore regex option functionality.""" + + # Invalid regex. + result = cs.main("--ignore-multiline-regex=(", std=True) + assert isinstance(result, tuple) + code, stdout, _ = result + assert code == EX_USAGE + assert "usage:" in stdout + + fname = tmp_path / "flag.txt" + fname.write_text( + """ + Please see http://example.com/abandonned for info + # codespell:ignore-begin + ''' + abandonned + abandonned + ''' + # codespell:ignore-end + abandonned + """ + ) + assert cs.main(fname) == 4 + assert ( + cs.main( + fname, + "--ignore-multiline-regex", + "codespell:ignore-begin.*codespell:ignore-end", + ) + == 2 + ) + + def test_uri_regex_option( tmp_path: Path, capsys: pytest.CaptureFixture[str], diff --git a/pyproject.toml b/pyproject.toml index 87fa3c4b2e..feeb1e186e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,6 +169,6 @@ max-complexity = 45 [tool.ruff.lint.pylint] allow-magic-value-types = ["bytes", "int", "str",] max-args = 13 -max-branches = 46 -max-returns = 11 +max-branches = 47 +max-returns = 12 max-statements = 119