From 48b5a6221c8a203268e8905de3238d9d47411f75 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 5 Dec 2024 11:32:31 -0500 Subject: [PATCH] feat(ingest): add urn validation test files (#12036) --- .../tests/unit/urns/invalid_urns.txt | 40 +++++++++ .../tests/unit/urns/test_corp_group_urn.py | 10 --- .../tests/unit/urns/test_corpuser_urn.py | 10 --- .../tests/unit/urns/test_data_flow_urn.py | 8 -- .../tests/unit/urns/test_data_job_urn.py | 15 ---- .../urns/test_data_process_instance_urn.py | 10 --- .../tests/unit/urns/test_dataset_urn.py | 20 ----- .../tests/unit/urns/test_domain_urn.py | 8 -- .../tests/unit/urns/test_notebook_urn.py | 10 --- .../tests/unit/urns/test_tag_urn.py | 8 -- .../tests/unit/urns/test_urn.py | 88 +++++++++++-------- .../tests/unit/urns/valid_urns.txt | 24 +++++ 12 files changed, 115 insertions(+), 136 deletions(-) create mode 100644 metadata-ingestion/tests/unit/urns/invalid_urns.txt create mode 100644 metadata-ingestion/tests/unit/urns/valid_urns.txt diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns.txt b/metadata-ingestion/tests/unit/urns/invalid_urns.txt new file mode 100644 index 00000000000000..9ce2c99a1a4ee8 --- /dev/null +++ b/metadata-ingestion/tests/unit/urns/invalid_urns.txt @@ -0,0 +1,40 @@ +# Basic URN format tests +urn:li:abc +urn:li:abc: +urn:li:abc:() +urn:li:abc:(abc,) +urn:li:corpuser:abc) + +# Reserved characters +urn:li:corpuser:foo␟bar +urn:li:tag:a,b,c + +# CorpUser URN tests +urn:li:corpuser:(part1,part2) + +# Dataset URN tests +urn:li:dataset:(urn:li:user:abc,dataset,prod) +urn:li:dataset:(urn:li:user:abc,dataset) +urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv) + +# DataFlow URN tests +urn:li:dataFlow:(airflow,flow_id) + +# DataJob URN tests +urn:li:dataJob:(urn:li:user:abc,job_id) +urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod)) + +# Domain URN tests +urn:li:domain:(part1,part2) + +# Tag URN tests +urn:li:tag:(part1,part2) + +# Notebook URN tests +urn:li:notebook:(part1,part2,part3) + +# 
CorpGroup URN tests +urn:li:corpGroup:(part1,part2) + +# DataProcessInstance URN tests +urn:li:dataProcessInstance:(part1,part2) diff --git a/metadata-ingestion/tests/unit/urns/test_corp_group_urn.py b/metadata-ingestion/tests/unit/urns/test_corp_group_urn.py index 1897a0e8686f09..4e55e78255d1c1 100644 --- a/metadata-ingestion/tests/unit/urns/test_corp_group_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_corp_group_urn.py @@ -3,7 +3,6 @@ import pytest from datahub.utilities.urns.corp_group_urn import CorpGroupUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -17,12 +16,3 @@ def test_parse_urn(self) -> None: assert str(corp_group_urn) == corp_group_urn_str assert corp_group_urn == CorpGroupUrn(name="abc") assert corp_group_urn == CorpGroupUrn.create_from_id("abc") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - CorpGroupUrn.create_from_string( - "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)" - ) - - with self.assertRaises(InvalidUrnError): - CorpGroupUrn.create_from_string("urn:li:corpGroup:(part1,part2)") diff --git a/metadata-ingestion/tests/unit/urns/test_corpuser_urn.py b/metadata-ingestion/tests/unit/urns/test_corpuser_urn.py index 7a2a4f4ff4493c..e4a11b4f404c6e 100644 --- a/metadata-ingestion/tests/unit/urns/test_corpuser_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_corpuser_urn.py @@ -3,7 +3,6 @@ import pytest from datahub.utilities.urns.corpuser_urn import CorpuserUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -17,12 +16,3 @@ def test_parse_urn(self) -> None: assert str(corpuser_urn) == corpuser_urn_str assert corpuser_urn == CorpuserUrn("abc") assert corpuser_urn == CorpuserUrn.create_from_id("abc") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - CorpuserUrn.create_from_string( - 
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)" - ) - - with self.assertRaises(InvalidUrnError): - CorpuserUrn.create_from_string("urn:li:corpuser:(part1,part2)") diff --git a/metadata-ingestion/tests/unit/urns/test_data_flow_urn.py b/metadata-ingestion/tests/unit/urns/test_data_flow_urn.py index 524411121d418b..edb5563c5b22e3 100644 --- a/metadata-ingestion/tests/unit/urns/test_data_flow_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_data_flow_urn.py @@ -3,7 +3,6 @@ import pytest from datahub.utilities.urns.data_flow_urn import DataFlowUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -16,10 +15,3 @@ def test_parse_urn(self) -> None: assert data_flow_urn.get_env() == "prod" assert data_flow_urn.__str__() == "urn:li:dataFlow:(airflow,def,prod)" assert data_flow_urn == DataFlowUrn("airflow", "def", "prod") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - DataFlowUrn.create_from_string("urn:li:abc:(airflow,def,prod)") - - with self.assertRaises(InvalidUrnError): - DataFlowUrn.create_from_string("urn:li:dataFlow:(airflow,flow_id)") diff --git a/metadata-ingestion/tests/unit/urns/test_data_job_urn.py b/metadata-ingestion/tests/unit/urns/test_data_job_urn.py index bf039cd2a91f96..484e5a474c0cd2 100644 --- a/metadata-ingestion/tests/unit/urns/test_data_job_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_data_job_urn.py @@ -4,7 +4,6 @@ from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -22,17 +21,3 @@ def test_parse_urn(self) -> None: assert data_job_urn == DataJobUrn( "urn:li:dataFlow:(airflow,flow_id,prod)", "job_id" ) - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - DataJobUrn.create_from_string( - 
"urn:li:abc:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)" - ) - - with self.assertRaises(InvalidUrnError): - DataJobUrn.create_from_string("urn:li:dataJob:(urn:li:user:abc,job_id)") - - with self.assertRaises(InvalidUrnError): - DataJobUrn.create_from_string( - "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod))" - ) diff --git a/metadata-ingestion/tests/unit/urns/test_data_process_instance_urn.py b/metadata-ingestion/tests/unit/urns/test_data_process_instance_urn.py index a86f8dd99416ff..f9087b19b13c32 100644 --- a/metadata-ingestion/tests/unit/urns/test_data_process_instance_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_data_process_instance_urn.py @@ -3,7 +3,6 @@ import pytest from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -20,12 +19,3 @@ def test_parse_urn(self) -> None: assert dataprocessinstance_urn == DataProcessInstanceUrn("abc") assert dataprocessinstance_urn == DataProcessInstanceUrn.create_from_id("abc") assert "abc" == dataprocessinstance_urn.get_dataprocessinstance_id() - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - DataProcessInstanceUrn.create_from_string("urn:li:abc:dataProcessInstance") - - with self.assertRaises(InvalidUrnError): - DataProcessInstanceUrn.create_from_string( - "urn:li:dataProcessInstance:(part1,part2)" - ) diff --git a/metadata-ingestion/tests/unit/urns/test_dataset_urn.py b/metadata-ingestion/tests/unit/urns/test_dataset_urn.py index 53065143a6ae4f..1be5cd59152009 100644 --- a/metadata-ingestion/tests/unit/urns/test_dataset_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_dataset_urn.py @@ -4,7 +4,6 @@ from datahub.utilities.urns.data_platform_urn import DataPlatformUrn from datahub.utilities.urns.dataset_urn import DatasetUrn -from datahub.utilities.urns.error import InvalidUrnError 
@pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -20,22 +19,3 @@ def test_parse_urn(self) -> None: assert dataset_urn.get_env() == "PROD" assert dataset_urn.__str__() == dataset_urn_str assert dataset_urn == DatasetUrn("urn:li:dataPlatform:abc", "def", "prod") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - DatasetUrn.create_from_string( - "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)" - ) - - with self.assertRaises(InvalidUrnError): - DatasetUrn.create_from_string( - "urn:li:dataset:(urn:li:user:abc,dataset,prod)" - ) - - with self.assertRaises(InvalidUrnError): - DatasetUrn.create_from_string("urn:li:dataset:(urn:li:user:abc,dataset)") - - with self.assertRaises(InvalidUrnError): - DatasetUrn.create_from_string( - "urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)" - ) diff --git a/metadata-ingestion/tests/unit/urns/test_domain_urn.py b/metadata-ingestion/tests/unit/urns/test_domain_urn.py index 843a5bf40f5c63..aa5050ce1c030e 100644 --- a/metadata-ingestion/tests/unit/urns/test_domain_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_domain_urn.py @@ -3,7 +3,6 @@ import pytest from datahub.utilities.urns.domain_urn import DomainUrn -from datahub.utilities.urns.error import InvalidUrnError @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -17,10 +16,3 @@ def test_parse_urn(self) -> None: assert str(domain_urn) == domain_urn_str assert domain_urn == DomainUrn("abc") assert domain_urn == DomainUrn.create_from_id("abc") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - DomainUrn.create_from_string("urn:li:abc:domain") - - with self.assertRaises(InvalidUrnError): - DomainUrn.create_from_string("urn:li:domain:(part1,part2)") diff --git a/metadata-ingestion/tests/unit/urns/test_notebook_urn.py b/metadata-ingestion/tests/unit/urns/test_notebook_urn.py index 3ec580f02142b7..6d4dd2ee6fa8c0 100644 --- a/metadata-ingestion/tests/unit/urns/test_notebook_urn.py +++ 
b/metadata-ingestion/tests/unit/urns/test_notebook_urn.py @@ -2,7 +2,6 @@ import pytest -from datahub.utilities.urns.error import InvalidUrnError from datahub.utilities.urns.notebook_urn import NotebookUrn @@ -16,12 +15,3 @@ def test_parse_urn(self) -> None: assert str(notebook_urn) == notebook_urn_str assert notebook_urn == NotebookUrn("querybook", "123") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - NotebookUrn.create_from_string( - "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)" - ) - - with self.assertRaises(InvalidUrnError): - NotebookUrn.create_from_string("urn:li:notebook:(part1,part2,part3)") diff --git a/metadata-ingestion/tests/unit/urns/test_tag_urn.py b/metadata-ingestion/tests/unit/urns/test_tag_urn.py index fa3664bcc02180..5f4c9077e28294 100644 --- a/metadata-ingestion/tests/unit/urns/test_tag_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_tag_urn.py @@ -2,7 +2,6 @@ import pytest -from datahub.utilities.urns.error import InvalidUrnError from datahub.utilities.urns.tag_urn import TagUrn @@ -17,10 +16,3 @@ def test_parse_urn(self) -> None: assert str(tag_urn) == tag_urn_str assert tag_urn == TagUrn("abc") assert tag_urn == TagUrn.create_from_id("abc") - - def test_invalid_urn(self) -> None: - with self.assertRaises(InvalidUrnError): - TagUrn.create_from_string("urn:li:abc:tag_id") - - with self.assertRaises(InvalidUrnError): - TagUrn.create_from_string("urn:li:tag:(part1,part2)") diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 73badb3d1b4234..0c362473c0cf18 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,16 +1,17 @@ +import logging +import pathlib +from typing import List + import pytest -from datahub.metadata.urns import ( - CorpUserUrn, - DashboardUrn, - DataPlatformUrn, - DatasetUrn, - Urn, -) +from datahub.metadata.urns import CorpUserUrn, DatasetUrn, Urn from 
datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") +_CURRENT_DIR = pathlib.Path(__file__).parent +logger = logging.getLogger(__name__) + def test_parse_urn() -> None: simple_urn_str = "urn:li:dataPlatform:abc" @@ -40,38 +41,12 @@ def test_url_encode_urn() -> None: ) -def test_invalid_urn() -> None: - with pytest.raises(InvalidUrnError): - Urn.from_string("urn:li:abc") - - with pytest.raises(InvalidUrnError): - Urn.from_string("urn:li:abc:") - - with pytest.raises(InvalidUrnError): - Urn.from_string("urn:li:abc:()") - - with pytest.raises(InvalidUrnError): - Urn.from_string("urn:li:abc:(abc,)") - - with pytest.raises(InvalidUrnError): - Urn.from_string("urn:li:corpuser:abc)") - - def test_urn_colon() -> None: - # Colon characters are valid in urns, and should not mess up parsing. - - urn = Urn.from_string( - "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)" - ) - assert isinstance(urn, DashboardUrn) - - assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") - assert DatasetUrn.from_string( - "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)" - ) - assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + # There's a bunch of other, simpler tests for special characters in the valid_urns test. + # This test ensures that the type dispatch and fields work fine here. # I'm not sure why you'd ever want this, but technically it's a valid urn. 
+    urn = Urn.from_string("urn:li:corpuser::")
     assert isinstance(urn, CorpUserUrn)
     assert urn.username == ":"
 
@@ -85,9 +60,55 @@ def test_urn_coercion() -> None:
     assert urn == Urn.from_string(urn.urn())
 
 
-def test_urn_type_dispatch() -> None:
+def test_urn_type_dispatch_1() -> None:
     urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)")
     assert isinstance(urn, DatasetUrn)
 
     with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"):
         DatasetUrn.from_string("urn:li:corpuser:foo")
+
+
+def test_urn_type_dispatch_2() -> None:
+    urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)"
+    assert Urn.from_string(urn).urn() == urn
+
+    with pytest.raises(InvalidUrnError, match="Passed an urn of type dataJob"):
+        CorpUserUrn.from_string(urn)
+
+
+def _load_urns(file_name: pathlib.Path) -> List[str]:
+    """Read urn strings from a text fixture, one per line.
+
+    Blank lines and comment lines (first non-whitespace character is "#")
+    are skipped; remaining lines are returned stripped of surrounding
+    whitespace, in file order. Asserts that at least one urn was found so
+    an empty fixture fails loudly instead of passing vacuously.
+    """
+    # Decode explicitly as UTF-8: the fixtures contain non-ASCII characters
+    # (e.g. U+241F), and the platform default encoding is not always UTF-8.
+    text = file_name.read_text(encoding="utf-8")
+    stripped_lines = [line.strip() for line in text.splitlines()]
+    urns = [line for line in stripped_lines if line and not line.startswith("#")]
+    assert len(urns) > 0, f"No urns found in {file_name}"
+    return urns
+
+
+def test_valid_urns() -> None:
+    valid_urns_file = _CURRENT_DIR / "valid_urns.txt"
+    valid_urns = _load_urns(valid_urns_file)
+
+    for valid_urn in valid_urns:
+        logger.info(f"Testing valid URN: {valid_urn}")
+        parsed_urn = Urn.from_string(valid_urn)
+        assert parsed_urn.urn() == valid_urn
+
+
+def test_invalid_urns() -> None:
+    invalid_urns_file = _CURRENT_DIR / "invalid_urns.txt"
+    invalid_urns = _load_urns(invalid_urns_file)
+
+    # Test each invalid URN
+    for invalid_urn in invalid_urns:
+        with pytest.raises(InvalidUrnError):
+            logger.info(f"Testing invalid URN: {invalid_urn}")
+            Urn.from_string(invalid_urn)
diff --git a/metadata-ingestion/tests/unit/urns/valid_urns.txt b/metadata-ingestion/tests/unit/urns/valid_urns.txt
new file mode 100644
index 00000000000000..23205ec9a7235b
--- /dev/null
+++ b/metadata-ingestion/tests/unit/urns/valid_urns.txt
@@ -0,0 +1,24 @@
+# Unknown entity types become generic urns
+urn:li:abc:foo
+urn:li:abc:(foo,bar) +urn:li:abc:(urn:li:dataPlatform:abc,def,prod) + +# A bunch of pretty normal urns +urn:li:corpuser:foo +urn:li:corpGroup:bar +urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod) +urn:li:dataFlow:(airflow,def,prod) +urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id) +urn:li:tag:abc +urn:li:chart:(looker,chart_name) +urn:li:dashboard:(looker,dashboard_name) +urn:li:dataProcessInstance:abc +urn:li:domain:abc +urn:li:notebook:(querybook,123) + +# Urns with colons and other special characters +urn:li:tag:dbt:bar +urn:li:tag:: +urn:li:dashboard:(looker,dashboards.thelook::customer_lookup) +urn:li:dataPlatform:abc:def +urn:li:corpuser:foo:bar@example.com