From bc7f6f8d02e7c6ed2d08b4e417bbf7f80e5703c4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 27 Nov 2024 12:49:15 -0500 Subject: [PATCH 1/3] feat(ingest): add tests for colon characters in urns --- metadata-ingestion/tests/unit/urns/test_urn.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 1bf48082fec8c..e0cc3b96b62f1 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,6 +1,6 @@ import pytest -from datahub.metadata.urns import DatasetUrn, Urn +from datahub.metadata.urns import DashboardUrn, DataPlatformUrn, DatasetUrn, Urn from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -48,6 +48,21 @@ def test_invalid_urn() -> None: Urn.create_from_string("urn:li:abc:(abc,)") +def test_urn_colon() -> None: + # Colon characters are valid in urns, and should not mess up parsing. + + urn = Urn.from_string( + "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)" + ) + assert isinstance(urn, DashboardUrn) + + assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") + assert DatasetUrn.from_string( + "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)" + ) + assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + + def test_urn_type_dispatch() -> None: urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)") assert isinstance(urn, DatasetUrn) From df4aa11cb7f7ba1489bfe7068722bb8a3ce80d1a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 27 Nov 2024 14:16:32 -0500 Subject: [PATCH 2/3] fix tests for env casing --- metadata-ingestion/tests/unit/urns/test_urn.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index e0cc3b96b62f1..03e0a84c91b87 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,6 +1,12 @@ import pytest -from datahub.metadata.urns import DashboardUrn, DataPlatformUrn, DatasetUrn, Urn +from datahub.metadata.urns import ( + CorpUserUrn, + DashboardUrn, + DataPlatformUrn, + DatasetUrn, + Urn, +) from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -58,13 +64,19 @@ def test_urn_colon() -> None: assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") assert DatasetUrn.from_string( - "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)" + "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)" ) assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + # I'm not sure why you'd ever want this, but technically it's a valid urn. + urn = Urn.from_string("urn:li:corpuser::") + assert isinstance(urn, CorpUserUrn) + assert urn.username == ":" + assert urn == CorpUserUrn(":") + def test_urn_type_dispatch() -> None: - urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)") + urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn) with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"): From 0b7a91cdc1220e0b6494ea3e76769820e6ec8786 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 27 Nov 2024 15:14:54 -0500 Subject: [PATCH 3/3] update tests --- .../src/datahub/utilities/urn_encoder.py | 3 ++- metadata-ingestion/tests/unit/urns/test_urn.py | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py index 88c0a128b8e46..4f19eeff3e70f 100644 --- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py +++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py @@ -4,7 +4,8 @@ # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage. # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes. -RESERVED_CHARS = {",", "(", ")"} +# Also see https://datahubproject.io/docs/what/urn/#restrictions +RESERVED_CHARS = {",", "(", ")", "␟"} RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"}) diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 03e0a84c91b87..73badb3d1b423 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -42,16 +42,19 @@ def test_url_encode_urn() -> None: def test_invalid_urn() -> None: with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc") + Urn.from_string("urn:li:abc") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:") + Urn.from_string("urn:li:abc:") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:()") + Urn.from_string("urn:li:abc:()") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:(abc,)") + Urn.from_string("urn:li:abc:(abc,)") + + with pytest.raises(InvalidUrnError): + Urn.from_string("urn:li:corpuser:abc)") def test_urn_colon() -> None: @@ -75,6 +78,13 @@ def test_urn_colon() -> None: assert urn == CorpUserUrn(":") +def test_urn_coercion() -> None: + urn = CorpUserUrn("foo␟bar") + assert urn.urn() == "urn:li:corpuser:foo%E2%90%9Fbar" + + assert urn == Urn.from_string(urn.urn()) + + def test_urn_type_dispatch() -> None: urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn)