feat(ingest): loosen airflow plugin dependencies requirements (#10106)
hsheth2 authored Mar 27, 2024
1 parent 654d991 commit 07ef677
Showing 11 changed files with 131 additions and 69 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/airflow-plugin.yml
@@ -40,16 +40,16 @@ jobs:
           extra_pip_requirements: "apache-airflow~=2.2.4"
           extra_pip_extras: plugin-v1
         - python-version: "3.10"
-          extra_pip_requirements: 'apache-airflow~=2.4.0 pluggy==1.0.0 "pendulum<3.0" "Flask-Session<0.6.0"'
-          extra_pip_extras: plugin-v2
+          extra_pip_requirements: "apache-airflow==2.4.3"
+          extra_pip_extras: plugin-v2,test-airflow24
         - python-version: "3.10"
-          extra_pip_requirements: 'apache-airflow~=2.6.0 "pendulum<3.0" "Flask-Session<0.6.0"'
+          extra_pip_requirements: 'apache-airflow==2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt'
           extra_pip_extras: plugin-v2
         - python-version: "3.10"
-          extra_pip_requirements: 'apache-airflow~=2.7.0 pydantic==2.4.2 "Flask-Session<0.6.0"'
+          extra_pip_requirements: 'apache-airflow==2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt'
           extra_pip_extras: plugin-v2
         - python-version: "3.10"
-          extra_pip_requirements: 'apache-airflow>=2.8.0 pydantic>=2.4.2 "Flask-Session<0.6.0"'
+          extra_pip_requirements: 'apache-airflow==2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt'
           extra_pip_extras: plugin-v2
       fail-fast: false
     steps:
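Taken together, the matrix entries above switch from floating `~=` ranges (plus ad-hoc pins like `pluggy` and `Flask-Session`) to an exact Airflow version paired with Airflow's official constraints file for that release. Judging from the `installTest` task in the build.gradle change below, `extra_pip_requirements` is appended to a `pip install` of the package extras, so the 2.6 entry resolves to roughly the following sketch (not the CI's literal command line):

```python
# Sketch of what the Airflow 2.6 matrix entry boils down to: the "-c" file
# pins any transitive dependencies it names, but never installs anything
# on its own.
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "-e", ".[integration-tests,plugin-v2]",  # extras per extra_pip_extras
    "apache-airflow==2.6.3",
    "-c", "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt",
])
```

The exact `==` pins matter because each constraints file is generated for a single Airflow release: the version being installed has to match the release named in the URL.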
3 changes: 3 additions & 0 deletions metadata-ingestion-modules/airflow-plugin/README.md
@@ -2,3 +2,6 @@
 
 See [the DataHub Airflow docs](https://datahubproject.io/docs/lineage/airflow) for details.
 
+## Developing
+
+See the [developing docs](https://datahubproject.io/docs/next/metadata-ingestion/developing/).
31 changes: 19 additions & 12 deletions metadata-ingestion-modules/airflow-plugin/build.gradle
@@ -54,9 +54,20 @@ task installDev(type: Exec, dependsOn: [install]) {
     def sentinel_file = "${venv_name}/.build_install_dev_sentinel"
     inputs.file file('setup.py')
     outputs.file("${sentinel_file}")
+    // For some reason, the Airflow constraints files pin things like black and pytest.
+    // This allows us to not respect those constraints ourselves.
     commandLine 'bash', '-c',
         "source ${venv_name}/bin/activate && set -x && " +
-        "${pip_install_command} -e .[dev${extra_pip_extras}] ${extra_pip_requirements} && " +
+        "${pip_install_command} -e .[dev${extra_pip_extras}] && " +
         "touch ${sentinel_file}"
 }
+task installTest(type: Exec, dependsOn: [installDev]) {
+    def sentinel_file = "${venv_name}/.build_install_test_sentinel"
+    inputs.file file('setup.py')
+    outputs.file(sentinel_file)
+    commandLine 'bash', '-c',
+        "source ${venv_name}/bin/activate && set -x && " +
+        "${pip_install_command} -e .[integration-tests${extra_pip_extras}] ${extra_pip_requirements} && " +
+        "touch ${sentinel_file}"
+}
 
@@ -82,18 +93,14 @@ task lintFix(type: Exec, dependsOn: installDev) {
         "mypy src/ tests/ "
 }
 
-task installDevTest(type: Exec, dependsOn: [installDev]) {
-    def sentinel_file = "${venv_name}/.build_install_dev_test_sentinel"
-    inputs.file file('setup.py')
-    outputs.dir("${venv_name}")
-    outputs.file("${sentinel_file}")
-    commandLine 'bash', '-c',
-        "source ${venv_name}/bin/activate && set -x && " +
-        "${pip_install_command} -e .[dev,integration-tests${extra_pip_extras}] ${extra_pip_requirements} && " +
-        "touch ${sentinel_file}"
-}
+// HACK: Some of the Airflow constraint files conflict with packages that we install (e.g. black).
+// I'm not sure why they constrain those in the first place, but we don't want to respect them
+// when running linting. However, we do want to respect them when running integration tests.
+// There's two install steps, and these constraints ensure that we run them in the right order.
+installTest.shouldRunAfter lint
+installTest.shouldRunAfter lintFix
 
-task testQuick(type: Exec, dependsOn: installDevTest) {
+task testQuick(type: Exec, dependsOn: installTest) {
     inputs.files(project.fileTree(dir: "src/", include: "**/*.py"))
     inputs.files(project.fileTree(dir: "tests/"))
     commandLine 'bash', '-c',
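The `HACK` comment above is the crux of this file's change: Airflow's constraint files pin developer tooling such as black and pytest alongside runtime libraries, so `installDev` deliberately skips `${extra_pip_requirements}` (and with it the `-c` constraints) while `installTest` applies them before the integration tests run. An illustrative way to see those tooling pins for yourself (requires network access; exact versions depend on the file's snapshot):

```python
# Print the developer-tool pins inside one of the Airflow constraints files
# referenced by this commit, which is why the unconstrained installDev step
# exists for linting.
import urllib.request

URL = "https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt"
with urllib.request.urlopen(URL) as resp:
    for line in resp.read().decode().splitlines():
        if line.startswith(("black==", "pytest==")):
            print(line)
```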
42 changes: 23 additions & 19 deletions metadata-ingestion-modules/airflow-plugin/setup.py
@@ -39,7 +39,10 @@ def get_long_description():
     "plugin-v1": set(),
     "plugin-v2": {
         f"acryl-datahub[sql-parser]{_self_pin}",
-        "openlineage-airflow==1.2.0",
+        # We remain restrictive on the versions allowed here to prevent
+        # us from being broken by backwards-incompatible changes in the
+        # underlying package.
+        "openlineage-airflow>=1.2.0,<=1.7.0",
     },
 }
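The loosened `openlineage-airflow` specifier above is the heart of the commit: releases inside the tested window are now accepted automatically, while anything beyond it stays excluded. A small self-check of the range's behavior using the `packaging` library (illustrative):

```python
# Both bounds of ">=1.2.0,<=1.7.0" are inclusive; intermediate releases are
# allowed, and untested future versions are rejected.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=1.2.0,<=1.7.0")
assert "1.2.0" in spec and "1.7.0" in spec
assert "1.4.1" in spec
assert "1.8.0" not in spec
```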

@@ -56,7 +59,6 @@ def get_long_description():
     "types-requests",
     "types-toml",
     "types-PyYAML",
-    "types-freezegun",
     "types-cachetools",
     # versions 0.1.13 and 0.1.14 seem to have issues
     "types-click==0.1.12",
@@ -76,42 +78,40 @@ def get_long_description():
     # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
     "pydantic>=1.10",
     "pytest>=6.2.2",
-    "pytest-asyncio>=0.16.0",
     "pytest-cov>=2.8.1",
     "tox",
+    "tox-uv",
     "deepdiff",
     "tenacity",
     "requests-mock",
     "freezegun",
     "jsonpickle",
     "build",
     "twine",
     "packaging",
 }
 
 integration_test_requirements = {
     *dev_requirements,
     *plugins["datahub-file"],
     *plugins["datahub-kafka"],
     f"acryl-datahub[testing-utils]{_self_pin}",
     # Extra requirements for loading our test dags.
     "apache-airflow[snowflake]>=2.0.2",
-    # Connexion's new version breaks Airflow:
-    # See https://github.com/apache/airflow/issues/35234.
-    # TODO: We should transition to using Airflow's constraints file.
-    "connexion<3",
-    # https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350
-    # Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3".
-    # However, that doesn't work with older versions of Airflow. Instead
-    # of splitting this into integration-test-old and integration-test-new,
-    # adding a bound to SQLAlchemy was the simplest solution.
-    "sqlalchemy<1.4.42",
-    # To avoid https://github.com/snowflakedb/snowflake-connector-python/issues/1188,
-    # we need https://github.com/snowflakedb/snowflake-connector-python/pull/1193
+    # A collection of issues we've encountered:
+    # - Connexion's new version breaks Airflow:
+    #   See https://github.com/apache/airflow/issues/35234.
+    # - https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350
+    #   Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3".
+    # - To avoid https://github.com/snowflakedb/snowflake-connector-python/issues/1188,
+    #   we need https://github.com/snowflakedb/snowflake-connector-python/pull/1193
     "snowflake-connector-python>=2.7.10",
     "virtualenv",  # needed by PythonVirtualenvOperator
     "apache-airflow-providers-sqlite",
 }
+per_version_test_requirements = {
+    "test-airflow24": {
+        "pendulum<3.0",
+        "Flask-Session<0.6.0",
+        "connexion<3.0",
+    },
+}
 
 
 entry_points = {
@@ -169,5 +169,9 @@ def get_long_description():
         **{plugin: list(dependencies) for plugin, dependencies in plugins.items()},
         "dev": list(dev_requirements),
         "integration-tests": list(integration_test_requirements),
+        **{
+            plugin: list(dependencies)
+            for plugin, dependencies in per_version_test_requirements.items()
+        },
     },
 )
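The trailing dict comprehension merges `per_version_test_requirements` into `extras_require`, so the version-specific test pins ship as ordinary package extras. A hypothetical, heavily abbreviated view of the mapping that `setup()` ends up receiving:

```python
# Abbreviated illustration only: the real lists are longer, and the
# plugin-v2 entry carries the _self_pin version suffix.
extras_require = {
    "plugin-v1": [],
    "plugin-v2": ["acryl-datahub[sql-parser]", "openlineage-airflow>=1.2.0,<=1.7.0"],
    "dev": ["pytest>=6.2.2", "tox", "tox-uv"],
    "integration-tests": ["apache-airflow[snowflake]>=2.0.2"],
    # Contributed by per_version_test_requirements:
    "test-airflow24": ["pendulum<3.0", "Flask-Session<0.6.0", "connexion<3.0"],
}
```

A consumer (like the new `py310-airflow24` tox environment further down) can then combine them, e.g. `pip install -e '.[dev,integration-tests,plugin-v2,test-airflow24]'`.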
Empty file.
@@ -23,6 +23,7 @@
     HAS_AIRFLOW_LISTENER_API,
     HAS_AIRFLOW_STANDALONE_CMD,
 )
+from tests.utils import PytestConfig
 
 pytestmark = pytest.mark.integration
 
@@ -241,7 +242,7 @@ def _run_airflow(
 
 
 def check_golden_file(
-    pytestconfig: pytest.Config,
+    pytestconfig: PytestConfig,
     output_path: pathlib.Path,
     golden_path: pathlib.Path,
     ignore_paths: Sequence[str] = (),
@@ -295,14 +296,18 @@ class DagTestCase:
     *[
         pytest.param(
             # On Airflow 2.3-2.4, test plugin v2 without dataFlows.
-            f"v2_{test_case.dag_id}"
-            if HAS_AIRFLOW_DAG_LISTENER_API
-            else f"v2_{test_case.dag_id}_no_dag_listener",
+            (
+                f"v2_{test_case.dag_id}"
+                if HAS_AIRFLOW_DAG_LISTENER_API
+                else f"v2_{test_case.dag_id}_no_dag_listener"
+            ),
             test_case,
             False,
-            id=f"v2_{test_case.dag_id}"
-            if HAS_AIRFLOW_DAG_LISTENER_API
-            else f"v2_{test_case.dag_id}_no_dag_listener",
+            id=(
+                f"v2_{test_case.dag_id}"
+                if HAS_AIRFLOW_DAG_LISTENER_API
+                else f"v2_{test_case.dag_id}_no_dag_listener"
+            ),
             marks=pytest.mark.skipif(
                 not HAS_AIRFLOW_LISTENER_API,
                 reason="Cannot test plugin v2 without the Airflow plugin listener API",
@@ -313,7 +318,7 @@ class DagTestCase:
         ],
     )
 def test_airflow_plugin(
-    pytestconfig: pytest.Config,
+    pytestconfig: PytestConfig,
     tmp_path: pathlib.Path,
     golden_filename: str,
     test_case: DagTestCase,
@@ -367,12 +372,7 @@ def test_airflow_plugin(
         output_path=airflow_instance.metadata_file,
         golden_path=golden_path,
         ignore_paths=[
-            # Timing-related items.
-            # r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['start_date'\]",
-            # r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['end_date'\]",
-            # r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['duration'\]",
-            # TODO: If we switched to Git urls, maybe we could get this to work consistently.
-            # r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['fileloc'\]",
+            r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['openlineage_.*'\]",
         ],
     )
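The simplified ignore list drops the old commented-out timing entries and instead skips every `openlineage_*` custom property, since those vary across the OpenLineage versions the plugin now accepts. `check_golden_file` delegates to DataHub's testing helpers (not shown in this diff); conceptually it works like a `deepdiff` comparison with regex-excluded paths. A standalone sketch of that idea, not the plugin's actual helper:

```python
# Conceptual stand-in for check_golden_file: compare emitted metadata with a
# golden file while skipping volatile fields matched by the given regexes.
import json
import pathlib
import re

from deepdiff import DeepDiff

def diff_against_golden(output_path: pathlib.Path, golden_path: pathlib.Path, ignore_paths):
    output = json.loads(output_path.read_text())
    golden = json.loads(golden_path.read_text())
    return DeepDiff(
        golden,
        output,
        exclude_regex_paths=[re.compile(p) for p in ignore_paths],
    )
```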
@@ -23,6 +23,7 @@
 from datahub_airflow_plugin.entities import Dataset, Urn
 from datahub_airflow_plugin.hooks.datahub import DatahubKafkaHook, DatahubRestHook
 from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator
+from tests.utils import PytestConfig
 
 assert AIRFLOW_PATCHED
 
@@ -74,7 +75,7 @@ def test_airflow_provider_info():
 
 
 @pytest.mark.filterwarnings("ignore:.*is deprecated.*")
-def test_dags_load_with_no_errors(pytestconfig: pytest.Config) -> None:
+def test_dags_load_with_no_errors(pytestconfig: PytestConfig) -> None:
     airflow_examples_folder = (
         pytestconfig.rootpath / "src/datahub_airflow_plugin/example_dags"
     )
@@ -1,7 +1,8 @@
-import pytest
 import setuptools
 from datahub.testing.check_imports import ensure_no_indirect_model_imports
 
+from tests.utils import PytestConfig
+
 
 def test_package_list_match_inits():
     where = "./src"
@@ -10,7 +11,7 @@ def test_package_list_match_inits():
     assert package_list == namespace_packages, "are you missing a package init file?"
 
 
-def test_check_import_paths(pytestconfig: pytest.Config) -> None:
+def test_check_import_paths(pytestconfig: PytestConfig) -> None:
     root = pytestconfig.rootpath
 
     ensure_no_indirect_model_imports([root / "src", root / "tests"])
7 changes: 7 additions & 0 deletions metadata-ingestion-modules/airflow-plugin/tests/utils.py
@@ -0,0 +1,7 @@
+try:
+    from pytest import Config as PytestConfig  # type: ignore[attr-defined]
+except ImportError:
+    # Support for pytest 6.x.
+    from _pytest.config import Config as PytestConfig  # type: ignore
+
+__all__ = ["PytestConfig"]
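`pytest.Config` only became a public export in pytest 7, so annotating fixtures with it breaks on the pytest 6.x releases that older Airflow constraint files can pin. This shim resolves to the right class either way. A minimal, hypothetical usage:

```python
# Hypothetical test: annotate the built-in pytestconfig fixture via the shim
# so the same code works under both pytest 6.x and 7+.
from tests.utils import PytestConfig

def test_rootpath_exists(pytestconfig: PytestConfig) -> None:
    assert pytestconfig.rootpath.exists()
```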
49 changes: 32 additions & 17 deletions metadata-ingestion-modules/airflow-plugin/tox.ini
@@ -15,29 +15,44 @@ deps =
     # Airflow version
     airflow21: apache-airflow~=2.1.0
     airflow22: apache-airflow~=2.2.0
-    # On Airflow 2.4 and 2.5, Airflow's constraints file pins pluggy to 1.0.0,
-    # which has caused issues for us before. As such, we now pin it explicitly
-    # to prevent regressions.
-    # See https://github.com/datahub-project/datahub/pull/9365
-    airflow24: apache-airflow~=2.4.0,pluggy==1.0.0
+    airflow24: apache-airflow~=2.4.0
     airflow26: apache-airflow~=2.6.0
-    # Respect the constraints file on pendulum.
-    # See https://github.com/apache/airflow/issues/36274
-    airflow24,airflow26: pendulum>=2.0,<3.0
-    # The Airflow 2.7 constraints file points at pydantic v2, so we match that here.
-    # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt
-    # Note that Airflow is actually compatible with both pydantic v1 and v2, and the
-    # constraints file is overly restrictive.
     airflow27: apache-airflow~=2.7.0
-    airflow27: pydantic==2.4.2
     airflow28: apache-airflow~=2.8.0
-    # Apparently Flask-Session 0.6.0 was released by accident.
-    # See https://github.com/pallets-eco/flask-session/issues/209
-    airflow24,airflow26,airflow27,airflow28: Flask-Session<0.6.0
+
+    # Respect the Airflow constraints files.
+    # We can't make ourselves work with the constraints of Airflow < 2.3.
+    py310-airflow24: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.4.3/constraints-3.10.txt
+    py310-airflow26: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt
+    py310-airflow27: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt
+    py310-airflow28: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt
+
+    # Before pinning to the constraint files, we previously left the dependencies
+    # more open. There were a number of packages for which this caused issues.
+    # We've left those changes in here, commented out, for reference.
+    ; # On Airflow 2.4 and 2.5, Airflow's constraints file pins pluggy to 1.0.0,
+    ; # which has caused issues for us before. As such, we now pin it explicitly
+    ; # to prevent regressions.
+    ; # See https://github.com/datahub-project/datahub/pull/9365
+    ; airflow24: pluggy==1.0.0
+    ; # Respect the constraints file on pendulum.
+    ; # See https://github.com/apache/airflow/issues/36274
+    ; airflow24,airflow26: pendulum>=2.0,<3.0
+    ; # The Airflow 2.7 constraints file points at pydantic v2, so we match that here.
+    ; # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt
+    ; # Note that Airflow is actually compatible with both pydantic v1 and v2, and the
+    ; # constraints file is overly restrictive.
+    ; airflow27: pydantic==2.4.2
+    ; # Apparently Flask-Session 0.6.0 was released by accident.
+    ; # See https://github.com/pallets-eco/flask-session/issues/209
+    ; airflow24,airflow26,airflow27,airflow28: Flask-Session<0.6.0
 commands =
     pytest --cov-append {posargs}
 
 # For Airflow 2.4+, add the plugin-v2 extra.
-[testenv:py310-airflow{24,26,27,28}]
+[testenv:py310-airflow24]
+extras = dev,integration-tests,plugin-v2,test-airflow24
+
+[testenv:py310-airflow{26,27,28}]
 extras = dev,integration-tests,plugin-v2
24 changes: 24 additions & 0 deletions metadata-ingestion/developing.md
@@ -218,3 +218,27 @@ For example,
 ```shell
 pytest tests/integration/dbt/test_dbt.py --update-golden-files
 ```
+
+### Testing the Airflow plugin
+
+For the Airflow plugin, we use `tox` to test across multiple sets of dependencies.
+
+```sh
+cd metadata-ingestion-modules/airflow-plugin
+
+# Run all tests.
+tox
+
+# Run a specific environment.
+# These are defined in the `tox.ini` file.
+tox -e py310-airflow26
+
+# Run a specific test.
+tox -e py310-airflow26 -- tests/integration/test_plugin.py
+
+# Update all golden files.
+tox -- --update-golden-files
+
+# Update golden files for a specific environment.
+tox -e py310-airflow26 -- --update-golden-files
+```
