real_projects: Rewrite python-algorithms test to work on Windows

The python-algorithms project highlights an outstanding issue in FawltyDeps: Placeholder packages. As discussed in issue #176, these are packages that don't provide the expected import name (or any import name at all), but instead depend on other packages (transitive dependencies) which will then supply the relevant import name. The first example we came across of such a package (in #176) was qiskit. Another example is tensorflow, but only on Windows: On Windows the tensorflow package is a 1.9kB placeholder that depends on tensorflow-intel (266MB) to bring in the actual meat. (On POSIX, the tensorflow package itself contains everything.) We make two changes to our real_projects test for python-algorithms: 1. We add the posix_only flag to the all_reqs_installed experiment, to prevent it from running on Windows. Its expected results would be different there (because of tensorflow-intel). 2. We add a new experiment, some_reqs_customized, which uses a custom_mapping to resolve qiskit and tensorflow. The remaining requirements are installed (as in the previous experiment). This yields the "best" FawltyDeps result for this project. The remaining undeclared/unused dependencies are presumed to be true positives for this project.
tweag · Jan 19, 2024 · 43b7b62 · 43b7b62
1 parent d75ee76
commit 43b7b62
Show file tree

Hide file tree

Showing 2 changed files with 152 additions and 83 deletions.
diff --git a/tests/real_projects/python-algorithms.toml b/tests/real_projects/python-algorithms.toml
@@ -3,9 +3,9 @@
 # and where to find the relevant tarball, along with its expected checksum.
 name = "The Algorithms - Python"
 description = """
-  All algorithms implemented in Python - for education. 
-  One of the most popular (stars count) Python packages on GitHub.
-  """
+    All algorithms implemented in Python - for education. 
+    One of the most popular (stars count) Python packages on GitHub.
+"""
 url = "https://github.com/TheAlgorithms/Python/archive/77b4fa8b3f2070ff708405cca1381b7860e316ab.tar.gz"
 sha256 = "f46e3afeef27bb09d66fda69f50f562289fcfb5993c2e799e5765862b9f6c9f2"
 # The SHA256 checksum above can be found by running `sha256sum` on the
@@ -23,87 +23,99 @@ sha256 = "f46e3afeef27bb09d66fda69f50f562289fcfb5993c2e799e5765862b9f6c9f2"
 # missing fields.
 
 [experiments.all]
-description = "Running FD on the entire The Algorithms - Python project"
+description = """
+    Running FD on the entire TheAlgorithms/Python project, but w/o installing
+    any of its requirements. This relies heavily on IdentityMapping, and
+    exposes its weaknesses.
+"""
 args = []
 # When we run FawltyDeps with the above arguments, we expect these results:
 imports= [
-  "PIL",
-  "bs4",
-  "cv2",
-  "django",
-  "fake_useragent",
-  "lxml",
-  "matplotlib",
-  "mpmath",
-  "numpy",
-  "pandas",
-  "pytest",
-  "qiskit",
-  "requests",
-  "rich",
-  "scipy",
-  "seaborn",
-  "skfuzzy",
-  "sklearn",
-  "statsmodels",
-  "sympy",
-  "tensorflow",
-  "tweepy",
-  "xgboost"
+    "PIL",
+    "bs4",
+    "cv2",
+    "django",
+    "fake_useragent",
+    "lxml",
+    "matplotlib",
+    "mpmath",
+    "numpy",
+    "pandas",
+    "pytest",
+    "qiskit",
+    "requests",
+    "rich",
+    "scipy",
+    "seaborn",
+    "skfuzzy",
+    "sklearn",
+    "statsmodels",
+    "sympy",
+    "tensorflow",
+    "tweepy",
+    "xgboost",
 ]
 
 declared_deps = [
-  "beautifulsoup4",
-  "fake_useragent",
-  "keras",
-  "lxml",
-  "matplotlib",
-  "numpy",
-  "opencv-python",
-  "pandas",
-  "pillow",
-  "projectq",
-  "qiskit",
-  "requests",
-  "rich",
-  "scikit-fuzzy",
-  "scikit-learn",
-  "statsmodels",
-  "sympy",
-  "tensorflow",
-  "texttable",
-  "tweepy",
-  "xgboost",
-  "yulewalker",
+    "beautifulsoup4",
+    "fake_useragent",
+    "keras",
+    "lxml",
+    "matplotlib",
+    "numpy",
+    "opencv-python",
+    "pandas",
+    "pillow",
+    "projectq",
+    "qiskit",
+    "requests",
+    "rich",
+    "scikit-fuzzy",
+    "scikit-learn",
+    "statsmodels",
+    "sympy",
+    "tensorflow",
+    "texttable",
+    "tweepy",
+    "xgboost",
+    "yulewalker",
 ]
 
 undeclared_deps = [
-  "PIL",
-  "bs4",
-  "cv2",
-  "django",
-  "mpmath",
-  "pytest",
-  "scipy",
-  "seaborn",
-  "skfuzzy",
-  "sklearn"
+    "PIL",
+    "bs4",
+    "cv2",
+    "django",
+    "mpmath",
+    "pytest",
+    "scipy",
+    "seaborn",
+    "skfuzzy",
+    "sklearn",
 ]
 
 unused_deps = [
-  "beautifulsoup4",
-  "keras",
-  "opencv-python",
-  "pillow",
-  "projectq",
-  "scikit-fuzzy",
-  "scikit-learn",
-  "texttable",
-  "yulewalker"
+    "beautifulsoup4",
+    "keras",
+    "opencv-python",
+    "pillow",
+    "projectq",
+    "scikit-fuzzy",
+    "scikit-learn",
+    "texttable",
+    "yulewalker",
 ]
 
 [experiments.all_reqs_installed]
-description = "Running FD on the entire The Algorithms - Python project"
+description = """
+    Running FD on the TheAlgorithms/Python project, with all requirements
+    installed. This improves the situation somewhat, compared to the above, but
+    there are still several problems: qiskit and tensorflow (on Windows only)
+    are "placeholder packages" that rely on transitive dependencies to provide
+    their expected import names. Additionally, there appear to be several true
+    undeclared and unused deps.
+"""
+posix_only = true
 args = []
 requirements = [
     "beautifulsoup4",
@@ -127,7 +139,7 @@ requirements = [
     "texttable",
     "tweepy",
     "xgboost",
-    "yulewalker"
+    "yulewalker",
 ]
 # When we run FawltyDeps with the above arguments, we expect these results:
 undeclared_deps = [
@@ -136,14 +148,80 @@ undeclared_deps = [
     "pytest",
     "qiskit",
     "scipy",
-    "seaborn"
+    "seaborn",
 ]
 
 unused_deps = [
     "keras",
     "projectq",
     "qiskit",
     "texttable",
-    "yulewalker"
+    "yulewalker",
+]
+
+[experiments.some_reqs_customized]
+description = """
+    Running FD on the TheAlgorithms/Python project, with some requirements
+    resolved via custom_mapping, and all other requirements installed.
+    This solves the "placeholder package" problem for qiskit and tensorflow
+    by side-stepping it with a custom_mapping.
+"""
+# TheAlgorithms/Python depends on a couple of "placeholder" packages:
+# - The qiskit package does not provide the "qiskit" import name, but depends
+#   on other packages to populate the "qiskit" namespace.
+# - The same is true for tensorflow on Windows: The package itself is only 1.9kB
+#   and does not provide any import names at all, rather it depends on another
+#   package, tensorflow-intel, which contains the actual meat (266MB).
+#
+# Since we `pip install` with `--no-deps`, the dependencies of these packages
+# are not automatically present in our Python environment. But even if they
+# were, FawltyDeps would still not be able to recognize the connection between
+# the placeholder package and its transitive dependencies, and would therefore
+# still report the placeholder as _both_ an undeclared and unused dependency!
+#
+# The best way to work around this is to provide a custom mapping to resolve
+# these placeholders as they are intended to be used. We configure the custom
+# mapping by pointing to THIS file as a config file for FawltyDeps, which will
+# then parse the [tool.fawltydeps.custom_mapping] section below.
+args = ["--config-file", "$REAL_PROJECTS_DIR/python-algorithms.toml"]
+requirements = [
+    "beautifulsoup4",
+    "fake_useragent",
+    "keras",
+    "lxml",
+    "matplotlib",
+    "numpy",
+    "opencv-python",
+    "pandas",
+    "pillow",
+    "projectq",
+    "requests",
+    "rich",
+    "scikit-fuzzy",
+    "scikit-learn",
+    "statsmodels",
+    "sympy",
+    "texttable",
+    "tweepy",
+    "xgboost",
+    "yulewalker",
+]
+# When we run FawltyDeps with the above arguments, we expect these results:
+undeclared_deps = [
+    "django",
+    "mpmath",
+    "pytest",
+    "scipy",
+    "seaborn",
+]
+
+unused_deps = [
+    "keras",
+    "projectq",
+    "texttable",
+    "yulewalker",
 ]
 
+[tool.fawltydeps.custom_mapping]
+qiskit = ["qiskit"]
+tensorflow = ["tensorflow"]
diff --git a/tests/test_real_projects.py b/tests/test_real_projects.py
@@ -57,12 +57,8 @@ def run_fawltydeps_json(
     argv = [sys.executable, "-I", "-m", "fawltydeps", "--json"]
     if venv_dir is not None:
         argv += [f"--pyenv={venv_dir}", "--pyenv=."]
-    proc = subprocess.run(
-        argv + list(args),
-        stdout=subprocess.PIPE,
-        check=False,
-        cwd=cwd,
-    )
+    argv += [arg.replace("$REAL_PROJECTS_DIR", str(REAL_PROJECTS_DIR)) for arg in args]
+    proc = subprocess.run(argv, stdout=subprocess.PIPE, check=False, cwd=cwd)
     # Check if return code does not indicate error (see main.main for the full list)
     assert proc.returncode in {0, 3, 4}
     return json.loads(proc.stdout)  # type: ignore
@@ -172,11 +168,6 @@ def unpacked_project_dir(self, cache: pytest.Cache) -> Path:
         return Path(cache.mkdir(f"fawltydeps_{self.tarball.sha256}"))
 
 
-@pytest.mark.skipif(
-    sys.platform.startswith("win"),
-    reason="Real projects test are not supported on Windows"
-    " due to the test environment complications.",
-)
 @pytest.mark.parametrize(
     "project, experiment",
     [