diff --git a/airflow/utils/file.py b/airflow/utils/file.py index 89d0b7d9fc97a..5a3db7fd48f9b 100644 --- a/airflow/utils/file.py +++ b/airflow/utils/file.py @@ -40,11 +40,14 @@ class _IgnoreRule(Protocol): @staticmethod def compile(pattern: str, base_dir: Path, definition_file: Path) -> Optional['_IgnoreRule']: - pass + """ + Build an ignore rule from the supplied pattern where base_dir + and definition_file should be absolute paths. + """ @staticmethod def match(path: Path, rules: List['_IgnoreRule']) -> bool: - pass + """Match a candidate absolute path against a list of rules""" class _RegexpIgnoreRule(NamedTuple): @@ -57,7 +60,7 @@ class _RegexpIgnoreRule(NamedTuple): def compile(pattern: str, base_dir: Path, definition_file: Path) -> Optional[_IgnoreRule]: """Build an ignore rule from the supplied regexp pattern and log a useful warning if it is invalid""" try: - return _RegexpIgnoreRule(re.compile(pattern), base_dir.resolve()) + return _RegexpIgnoreRule(re.compile(pattern), base_dir) except re.error as e: log.warning("Ignoring invalid regex '%s' from %s: %s", pattern, definition_file, e) return None @@ -65,11 +68,10 @@ def compile(pattern: str, base_dir: Path, definition_file: Path) -> Optional[_Ig @staticmethod def match(path: Path, rules: List[_IgnoreRule]) -> bool: """Match a list of ignore rules against the supplied path""" - test_path: Path = path.resolve() for rule in rules: if not isinstance(rule, _RegexpIgnoreRule): raise ValueError(f"_RegexpIgnoreRule cannot match rules of type: {type(rule)}") - if rule.pattern.search(str(test_path.relative_to(rule.base_dir))) is not None: + if rule.pattern.search(str(path.relative_to(rule.base_dir))) is not None: return True return False @@ -95,21 +97,20 @@ def compile(pattern: str, _, definition_file: Path) -> Optional[_IgnoreRule]: # > If there is a separator at the beginning or middle (or both) of the pattern, then the # > pattern is relative to the directory level of the particular .gitignore file itself. # > Otherwise the pattern may also match at any level below the .gitignore level. - relative_to = definition_file.resolve().parent + relative_to = definition_file.parent ignore_pattern = GitWildMatchPattern(pattern) return _GlobIgnoreRule(ignore_pattern.regex, pattern, ignore_pattern.include, relative_to) @staticmethod def match(path: Path, rules: List[_IgnoreRule]) -> bool: """Match a list of ignore rules against the supplied path""" - test_path: Path = path.resolve() matched = False for r in rules: if not isinstance(r, _GlobIgnoreRule): raise ValueError(f"_GlobIgnoreRule cannot match rules of type: {type(r)}") rule: _GlobIgnoreRule = r # explicit typing to make mypy play nicely - rel_path = str(test_path.relative_to(rule.relative_to) if rule.relative_to else test_path.name) - if rule.raw_pattern.endswith("/") and test_path.is_dir(): + rel_path = str(path.relative_to(rule.relative_to) if rule.relative_to else path.name) + if rule.raw_pattern.endswith("/") and path.is_dir(): # ensure the test path will potentially match a directory pattern if it is a directory rel_path += "/" if rule.include is not None and rule.pattern.match(rel_path) is not None: @@ -208,10 +209,11 @@ def _find_path_from_directory( :return: a generator of file paths which should not be ignored. """ + # A Dict of patterns, keyed using resolved, absolute paths patterns_by_dir: Dict[Path, List[_IgnoreRule]] = {} for root, dirs, files in os.walk(base_dir_path, followlinks=True): - patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root), []) + patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root).resolve(), []) ignore_file_path = Path(root) / ignore_file_name if ignore_file_path.is_file(): @@ -233,7 +235,15 @@ def _find_path_from_directory( dirs[:] = [subdir for subdir in dirs if not ignore_rule_type.match(Path(root) / subdir, patterns)] - patterns_by_dir.update({Path(root) / sd: patterns.copy() for sd in dirs}) + # explicit loop for infinite recursion detection since we are following symlinks in this walk + for sd in dirs: + dirpath = (Path(root) / sd).resolve() + if dirpath in patterns_by_dir: + raise RuntimeError( + "Detected recursive loop when walking DAG directory " + + f"{base_dir_path}: {dirpath} has appeared more than once." + ) + patterns_by_dir.update({dirpath: patterns.copy()}) for file in files: if file == ignore_file_name: diff --git a/tests/utils/test_file.py b/tests/utils/test_file.py index c79836b58a9c4..99f7e90a7d033 100644 --- a/tests/utils/test_file.py +++ b/tests/utils/test_file.py @@ -16,10 +16,14 @@ # specific language governing permissions and limitations # under the License. +import os import os.path import unittest +from pathlib import Path from unittest import mock +import pytest + from airflow.utils.file import correct_maybe_zipped, find_path_from_directory, open_maybe_zipped from tests.models import TEST_DAGS_FOLDER @@ -77,7 +81,24 @@ def test_open_maybe_zipped_archive(self): assert isinstance(content, str) -class TestListPyFilesPath(unittest.TestCase): +class TestListPyFilesPath: + @pytest.fixture() + def test_dir(self, tmp_path): + # create test tree with symlinks + source = os.path.join(tmp_path, "folder") + target = os.path.join(tmp_path, "symlink") + py_file = os.path.join(source, "hello_world.py") + ignore_file = os.path.join(tmp_path, ".airflowignore") + os.mkdir(source) + os.symlink(source, target) + # write ignore files + with open(ignore_file, 'w') as f: + f.write("folder") + # write sample pyfile + with open(py_file, 'w') as f: + f.write("print('hello world')") + return tmp_path + def test_find_path_from_directory_regex_ignore(self): should_ignore = [ "test_invalid_cron.py", @@ -110,3 +131,36 @@ def test_find_path_from_directory_glob_ignore(self): assert len(list(filter(lambda file: os.path.basename(file) in should_not_ignore, files))) == len( should_not_ignore ) + + def test_find_path_from_directory_respects_symlinks_regexp_ignore(self, test_dir): + ignore_list_file = ".airflowignore" + found = list(find_path_from_directory(test_dir, ignore_list_file)) + + assert os.path.join(test_dir, "symlink", "hello_world.py") in found + assert os.path.join(test_dir, "folder", "hello_world.py") not in found + + def test_find_path_from_directory_respects_symlinks_glob_ignore(self, test_dir): + ignore_list_file = ".airflowignore" + found = list(find_path_from_directory(test_dir, ignore_list_file, ignore_file_syntax="glob")) + + assert os.path.join(test_dir, "symlink", "hello_world.py") in found + assert os.path.join(test_dir, "folder", "hello_world.py") not in found + + def test_find_path_from_directory_fails_on_recursive_link(self, test_dir): + # add a recursive link + recursing_src = os.path.join(test_dir, "folder2", "recursor") + recursing_tgt = os.path.join(test_dir, "folder2") + os.mkdir(recursing_tgt) + os.symlink(recursing_tgt, recursing_src) + + ignore_list_file = ".airflowignore" + + try: + list(find_path_from_directory(test_dir, ignore_list_file, ignore_file_syntax="glob")) + assert False, "Walking a self-recursive tree should fail" + except RuntimeError as err: + assert ( + str(err) + == f"Detected recursive loop when walking DAG directory {test_dir}: " + + f"{Path(recursing_tgt).resolve()} has appeared more than once." + )