From c417c01ae08f2dc44b31bc44a6ff87f7e8a6ae53 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware Date: Fri, 21 Jun 2024 03:58:33 +0530 Subject: [PATCH] fix: implement tags-to-terms transformation based on tags --- .../docs/transformer/dataset_transformer.md | 52 ++++- metadata-ingestion/setup.py | 1 + .../transformer/dataset_transformer.py | 15 ++ .../ingestion/transformer/tags_to_terms.py | 145 ++++++++++++ .../tests/unit/test_transform_dataset.py | 218 +++++++++++++++++- 5 files changed, 428 insertions(+), 3 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index 772a638b6a9487..8505815a93963f 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -10,7 +10,7 @@ The below table shows transformer which can transform aspects of entity [Dataset | `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)
- [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership)
- [Extract Ownership from Tags](#extract-ownership-from-tags)
- [Clean suffix prefix from Ownership](#clean-suffix-prefix-from-ownership) | | `globalTags` | - [Simple Add Dataset globalTags ](#simple-add-dataset-globaltags)
- [Pattern Add Dataset globalTags](#pattern-add-dataset-globaltags)
- [Add Dataset globalTags](#add-dataset-globaltags) | | `browsePaths` | - [Set Dataset browsePath](#set-dataset-browsepath) | -| `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms) | +| `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms)
- [Tags to Term Mapping](#tags-to-term-mapping) | | `schemaMetadata` | - [Pattern Add Dataset Schema Field glossaryTerms](#pattern-add-dataset-schema-field-glossaryterms)
- [Pattern Add Dataset Schema Field globalTags](#pattern-add-dataset-schema-field-globaltags) | | `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)
- [Add Dataset datasetProperties](#add-dataset-datasetproperties) | | `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)
- [Pattern Add Dataset domains](#pattern-add-dataset-domains)
- [Domain Mapping Based on Tags](#domain-mapping-based-on-tags) |
@@ -668,6 +668,56 @@ We can add glossary terms to datasets based on a regex filter.
         ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"]
         ".*example2.*": ["urn:li:glossaryTerm:PostalCode"]
   ```
+
+## Tags to Term Mapping
+### Config Details
+
+| Field         | Required | Type               | Default     | Description                                                                                     |
+|---------------|----------|--------------------|-------------|-------------------------------------------------------------------------------------------------|
+| `tags`        | ✅        | List[str]          |             | List of tag names based on which terms will be created and associated with the dataset.        |
+| `semantics`   |          | enum               | "OVERWRITE" | Determines whether to OVERWRITE or PATCH the terms associated with the dataset on DataHub GMS. |
+
+<br/>
+
+The `tags_to_term` transformer maps specific tags to glossary terms within DataHub. It takes a configuration of tags that should be translated into corresponding glossary terms, and applies those mappings to matching tags found either at the column level or at the top level of a dataset.
+
+When specifying tags in the configuration, use the tag's simple name rather than the full tag URN.
+
+For example, instead of using the tag URN `urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value`, specify just the tag name `tag_name` in the mapping configuration.
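+
+The matching rule keeps only the last dot-separated component of the tag URN and strips any trailing `:value` part. A minimal sketch of that derivation (the helper name `simple_tag_name` is illustrative, not part of the codebase, but the expression mirrors the transformer's implementation):
+
+```python
+def simple_tag_name(tag_urn: str) -> str:
+    """Reduce a full tag URN to the simple name the transformer matches on."""
+    # "urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value" -> "tag_name"
+    return tag_urn.split("urn:li:tag:")[-1].split(".")[-1].split(":")[0]
+
+
+assert simple_tag_name("urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value") == "tag_name"
+assert simple_tag_name("urn:li:tag:PII") == "PII"
+```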
+
+```yaml
+transformers:
+  - type: "tags_to_term"
+    config:
+      semantics: OVERWRITE  # OVERWRITE is the default behavior
+      tags:
+        - "tag_name"
+```
+
+`tags_to_term` can be configured in the following ways:
+
+- Add terms based on tags, overwriting the terms already available for the dataset on DataHub GMS
+```yaml
+    transformers:
+      - type: "tags_to_term"
+        config:
+          semantics: OVERWRITE  # OVERWRITE is the default behavior
+          tags:
+            - "example1"
+            - "example2"
+            - "example3"
+  ```
+- Add terms based on tags, keeping the terms already available for the dataset on DataHub GMS
+```yaml
+    transformers:
+      - type: "tags_to_term"
+        config:
+          semantics: PATCH
+          tags:
+            - "example1"
+            - "example2"
+  ```
+
+With `PATCH` semantics, the terms derived from tags are merged with the glossary terms already present on DataHub GMS instead of replacing them.
+
 ## Pattern Add Dataset Schema Field glossaryTerms
 ### Config Details
 | Field | Required | Type | Default | Description |
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 04099e0a24b9ff..69a49e85a15fd8 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -715,6 +715,7 @@
     "replace_external_url = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrl",
     "pattern_cleanup_dataset_usage_user = datahub.ingestion.transformer.pattern_cleanup_dataset_usage_user:PatternCleanupDatasetUsageUser",
     "domain_mapping_based_on_tags = datahub.ingestion.transformer.dataset_domain_based_on_tags:DatasetTagDomainMapper",
+    "tags_to_term = datahub.ingestion.transformer.tags_to_terms:TagsToTermMapper",
 ],
 "datahub.ingestion.sink.plugins": [
     "file = datahub.ingestion.sink.file:FileSink",
diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py
index a78a79141e8e42..3e313ddd356be7 100644
--- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py
+++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py
@@ -27,6 +27,16 @@ def entity_types(self) -> List[str]:
         return ["dataset"]
 
 
+class TagTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta):
+    """Transformer that does transform sequentially on each tag."""
+
+    def __init__(self):
+        super().__init__()
+
+    def entity_types(self) -> List[str]:
+        return ["dataset", "container"]
+
+
 class DatasetOwnershipTransformer(DatasetTransformer, metaclass=ABCMeta):
     def aspect_name(self) -> str:
         return "ownership"
@@ -128,3 +138,8 @@ def aspect_name(self) -> str:
 class DatasetUsageStatisticsTransformer(DatasetTransformer, metaclass=ABCMeta):
     def aspect_name(self) -> str:
         return "datasetUsageStatistics"
+
+
+class TagsToTermTransformer(TagTransformer, metaclass=ABCMeta):
+    def aspect_name(self) -> str:
+        return "glossaryTerms"
diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py
new file mode 100644
index 00000000000000..67c9f784844b57
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py
@@ -0,0 +1,145 @@
+from typing import List, Optional, Set, cast
+
+import datahub.emitter.mce_builder as builder
+from datahub.configuration.common import (
+    TransformerSemantics,
+    TransformerSemanticsConfigModel,
+)
+from datahub.emitter.mce_builder import Aspect, make_term_urn
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.transformer.dataset_transformer import TagsToTermTransformer
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    GlobalTagsClass,
+    GlossaryTermAssociationClass,
+    GlossaryTermsClass,
+    SchemaMetadataClass,
+)
+
+
+class TagsToTermMapperConfig(TransformerSemanticsConfigModel):
+    tags: List[str]
+
+
+class TagsToTermMapper(TagsToTermTransformer):
+    """This transformer maps specified tags to corresponding glossary terms for a dataset."""
+
+    def __init__(self, config: TagsToTermMapperConfig, ctx: PipelineContext):
+        super().__init__()
+        self.ctx: PipelineContext = ctx
+        self.config: TagsToTermMapperConfig = config
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "TagsToTermMapper":
+        config = TagsToTermMapperConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    @staticmethod
+    def _merge_with_server_glossary_terms(
+        graph: DataHubGraph,
+        urn: str,
+        glossary_terms_aspect: Optional[GlossaryTermsClass],
+    ) -> Optional[GlossaryTermsClass]:
+        if not glossary_terms_aspect or not glossary_terms_aspect.terms:
+            # nothing to add, no need to consult server
+            return None
+
+        # Merge the transformed terms with existing server terms.
+        # The transformed terms take precedence, which may change the term context.
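+        # For example: if the server already holds terms {A, B} and the
+        # transformer produced {B, C}, the merged list keyed by term URN is
+        # {A, B, C}, with the transformer's association object winning for B.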
+        server_glossary_terms_aspect = graph.get_glossary_terms(entity_urn=urn)
+        if server_glossary_terms_aspect is not None:
+            glossary_terms_aspect.terms = list(
+                {
+                    **{term.urn: term for term in server_glossary_terms_aspect.terms},
+                    **{term.urn: term for term in glossary_terms_aspect.terms},
+                }.values()
+            )
+
+        return glossary_terms_aspect
+
+    @staticmethod
+    def get_tags_from_global_tags(global_tags: Optional[GlobalTagsClass]) -> Set[str]:
+        """Extracts tag URNs from GlobalTagsClass."""
+        if not global_tags or not global_tags.tags:
+            return set()
+
+        return {tag_assoc.tag for tag_assoc in global_tags.tags}
+
+    @staticmethod
+    def get_tags_from_schema_metadata(
+        schema_metadata: Optional[SchemaMetadataClass],
+    ) -> Set[str]:
+        """Extracts globalTags from all fields in SchemaMetadataClass."""
+        if not schema_metadata or not schema_metadata.fields:
+            return set()
+        tags = set()
+        for field in schema_metadata.fields:
+            if field.globalTags:
+                tags.update(
+                    TagsToTermMapper.get_tags_from_global_tags(field.globalTags)
+                )
+        return tags
+
+    def transform_aspect(
+        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+    ) -> Optional[Aspect]:
+
+        in_glossary_terms: Optional[GlossaryTermsClass] = cast(
+            Optional[GlossaryTermsClass], aspect
+        )
+
+        assert self.ctx.graph
+        in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
+            entity_urn
+        )
+        in_schema_metadata_aspect: Optional[
+            SchemaMetadataClass
+        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+
+        if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
+            return cast(Aspect, in_glossary_terms)
+
+        global_tags = TagsToTermMapper.get_tags_from_global_tags(in_global_tags_aspect)
+        schema_metadata_tags = TagsToTermMapper.get_tags_from_schema_metadata(
+            in_schema_metadata_aspect
+        )
+
+        # Combine tags from both global and schema level
+        combined_tags = global_tags.union(schema_metadata_tags)
+
+        tag_set = set(self.config.tags)
+        terms_to_add = set()
+        tags_to_delete = set()
+
+        # Check each global tag against the configured tag list and prepare terms
+        for full_tag in combined_tags:
+            # Reduce the tag URN to its simple name, e.g.
+            # "urn:li:tag:db.schema.tag_name:tag_value" -> "tag_name"
+            tag_name = full_tag.split("urn:li:tag:")[-1].split(".")[-1].split(":")[0]
+            if tag_name in tag_set:
+                term_urn = make_term_urn(tag_name)
+                terms_to_add.add(term_urn)
+                tags_to_delete.add(full_tag)  # Full URN for deletion
+
+        if not terms_to_add:
+            return cast(Aspect, in_glossary_terms)  # No new terms to add
+
+        for tag_urn in tags_to_delete:
+            self.ctx.graph.hard_delete_entity(urn=tag_urn)
+
+        # Initialize the Glossary Terms properly
+        out_glossary_terms = GlossaryTermsClass(
+            terms=[GlossaryTermAssociationClass(urn=term) for term in terms_to_add],
+            auditStamp=AuditStampClass(
+                time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter"
+            ),
+        )
+
+        if self.config.semantics == TransformerSemantics.PATCH:
+            patch_glossary_terms: Optional[
+                GlossaryTermsClass
+            ] = TagsToTermMapper._merge_with_server_glossary_terms(
+                self.ctx.graph, entity_urn, out_glossary_terms
+            )
+            return cast(Optional[Aspect], patch_glossary_terms)
+        else:
+            return cast(Aspect, out_glossary_terms)
diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py
index a0deae972badb4..4dbc8061d95ebb 100644
--- a/metadata-ingestion/tests/unit/test_transform_dataset.py
+++ b/metadata-ingestion/tests/unit/test_transform_dataset.py
@@ -70,7 +70,10 @@
 from datahub.ingestion.transformer.dataset_domain_based_on_tags import (
     DatasetTagDomainMapper,
 )
-from datahub.ingestion.transformer.dataset_transformer import DatasetTransformer
+from datahub.ingestion.transformer.dataset_transformer import (
+    DatasetTransformer,
+    TagTransformer,
+)
 from datahub.ingestion.transformer.extract_dataset_tags import ExtractDatasetTags
 from datahub.ingestion.transformer.extract_ownership_from_tags import (
     ExtractOwnersFromTagsTransformer,
@@ -86,6 +89,7 @@
     SimpleRemoveDatasetOwnership,
 )
 from datahub.ingestion.transformer.replace_external_url import ReplaceExternalUrl
+from datahub.ingestion.transformer.tags_to_terms import TagsToTermMapper
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
     DatasetPropertiesClass,
@@ -1891,7 +1895,7 @@ def test_pattern_dataset_schema_tags_transformation(mock_time):
 
 
 def run_dataset_transformer_pipeline(
-    transformer_type: Type[DatasetTransformer],
+    transformer_type: Type[Union[DatasetTransformer, TagTransformer]],
     aspect: Optional[builder.Aspect],
     config: dict,
     pipeline_context: PipelineContext = PipelineContext(run_id="transformer_pipe_line"),
@@ -3651,3 +3655,213 @@ def fake_get_tags(entity_urn: str) -> Optional[models.GlobalTagsClass]:
     assert len(transformed_aspect.domains) == 1
     assert acryl_domain in transformed_aspect.domains
     assert server_domain not in transformed_aspect.domains
+
+
+def test_tags_to_terms_transformation(mock_datahub_graph):
+    # Create term URNs for the test
+    term_urn_example1 = builder.make_term_urn("example1")
+    term_urn_example2 = builder.make_term_urn("example2")
+
+    def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass:
+        return models.GlobalTagsClass(
+            tags=[
+                TagAssociationClass(tag=builder.make_tag_urn("example1")),
+                TagAssociationClass(tag=builder.make_tag_urn("example2")),
+            ]
+        )
+
+    # fake the server response
+    def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass:
+        return models.SchemaMetadataClass(
+            schemaName="customer",  # not used
+            platform=builder.make_data_platform_urn(
+                "hive"
+            ),  # important <- platform must be an urn
+            version=0,
+            # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0
+            hash="",
+            # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string
+            platformSchema=models.OtherSchemaClass(
+                rawSchema="__insert raw schema here__"
+            ),
+            fields=[
+                models.SchemaFieldClass(
+                    fieldPath="first_name",
+                    globalTags=models.GlobalTagsClass(
+                        tags=[
+                            models.TagAssociationClass(
+                                tag=builder.make_tag_urn("example2")
+                            )
+                        ],
+                    ),
+                    glossaryTerms=models.GlossaryTermsClass(
+                        terms=[
+                            models.GlossaryTermAssociationClass(
+                                urn=builder.make_term_urn("pii")
+                            )
+                        ],
+                        auditStamp=models.AuditStampClass._construct_with_defaults(),
+                    ),
+                    type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()),
+                    nativeDataType="VARCHAR(100)",
+                    # use this to provide the type of the field in the source system's vernacular
+                ),
+                models.SchemaFieldClass(
+                    fieldPath="mobile_number",
+                    glossaryTerms=models.GlossaryTermsClass(
+                        terms=[
+                            models.GlossaryTermAssociationClass(
+                                urn=builder.make_term_urn("pii")
+                            )
+                        ],
+                        auditStamp=models.AuditStampClass._construct_with_defaults(),
+                    ),
+                    type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()),
+                    nativeDataType="VARCHAR(100)",
+                    # use this to provide the type of the field in the source system's vernacular
+                ),
+            ],
+        )
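+
+    # Both the dataset-level tags returned by fake_get_tags and the
+    # column-level tag "example2" above are visible to the transformer; only
+    # tag names listed in the config below are converted into glossary terms.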
+
+    pipeline_context = PipelineContext(run_id="transformer_pipe_line")
+    pipeline_context.graph = mock_datahub_graph(DatahubClientConfig())
+    pipeline_context.graph.get_tags = fake_get_tags  # type: ignore
+    pipeline_context.graph.get_schema_metadata = fake_schema_metadata  # type: ignore
+
+    # Configuring the transformer
+    config = {"tags": ["example1", "example2"]}
+
+    # Running the transformer within a test pipeline
+    output = run_dataset_transformer_pipeline(
+        transformer_type=TagsToTermMapper,
+        aspect=models.GlossaryTermsClass(
+            terms=[
+                models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii"))
+            ],
+            auditStamp=models.AuditStampClass._construct_with_defaults(),
+        ),
+        config=config,
+        pipeline_context=pipeline_context,
+    )
+
+    # Expected results
+    expected_terms = [term_urn_example2, term_urn_example1]
+
+    # Verify the output
+    assert len(output) == 2  # One for result and one for end of stream
+    terms_aspect = output[0].record.aspect
+    assert isinstance(terms_aspect, models.GlossaryTermsClass)
+    assert len(terms_aspect.terms) == len(expected_terms)
+    assert set(term.urn for term in terms_aspect.terms) == {
+        "urn:li:glossaryTerm:example1",
+        "urn:li:glossaryTerm:example2",
+    }
+
+
+def test_tags_to_terms_with_no_matching_terms(mock_datahub_graph):
+    # Setup for test where no tags match the provided term mappings
+    def fake_get_tags_no_match(entity_urn: str) -> models.GlobalTagsClass:
+        return models.GlobalTagsClass(
+            tags=[
+                TagAssociationClass(tag=builder.make_tag_urn("nonMatchingTag1")),
+                TagAssociationClass(tag=builder.make_tag_urn("nonMatchingTag2")),
+            ]
+        )
+
+    pipeline_context = PipelineContext(run_id="transformer_pipe_line")
+    pipeline_context.graph = mock_datahub_graph(DatahubClientConfig())
+    pipeline_context.graph.get_tags = fake_get_tags_no_match  # type: ignore
+
+    # No matching terms in config
+    config = {"tags": ["example1", "example2"]}
+
+    # Running the transformer within a test pipeline
+    output = run_dataset_transformer_pipeline(
+        transformer_type=TagsToTermMapper,
+        aspect=models.GlossaryTermsClass(
+            terms=[
+                models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii"))
+            ],
+            auditStamp=models.AuditStampClass._construct_with_defaults(),
+        ),
+        config=config,
+        pipeline_context=pipeline_context,
+    )
+
+    # Verify that the existing terms are returned unchanged when nothing matches
+    assert len(output) == 2  # One for result and one for end of stream
+    terms_aspect = output[0].record.aspect
+    assert isinstance(terms_aspect, models.GlossaryTermsClass)
+    assert len(terms_aspect.terms) == 1
+
+
+def test_tags_to_terms_with_missing_tags(mock_datahub_graph):
+    # Setup for test where no tags are present
+    def fake_get_no_tags(entity_urn: str) -> models.GlobalTagsClass:
+        return models.GlobalTagsClass(tags=[])
+
+    pipeline_context = PipelineContext(run_id="transformer_pipe_line")
+    pipeline_context.graph = mock_datahub_graph(DatahubClientConfig())
+    pipeline_context.graph.get_tags = fake_get_no_tags  # type: ignore
+
+    config = {"tags": ["example1", "example2"]}
+
+    # Running the transformer with no tags
+    output = run_dataset_transformer_pipeline(
+        transformer_type=TagsToTermMapper,
+        aspect=models.GlossaryTermsClass(
+            terms=[
+                models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii"))
+            ],
+            auditStamp=models.AuditStampClass._construct_with_defaults(),
+        ),
+        config=config,
+        pipeline_context=pipeline_context,
+    )
+
+    # Verify that no terms are added when there are no tags
+    assert len(output) == 2
+    terms_aspect = output[0].record.aspect
+    assert isinstance(terms_aspect, models.GlossaryTermsClass)
+    assert len(terms_aspect.terms) == 1
+
+
+def test_tags_to_terms_with_partial_match(mock_datahub_graph):
+    # Setup for partial match scenario
+    def fake_get_partial_match_tags(entity_urn: str) -> models.GlobalTagsClass:
+        return models.GlobalTagsClass(
+            tags=[
+                TagAssociationClass(
+                    tag=builder.make_tag_urn("example1")
+                ),  # Should match
+                TagAssociationClass(
+                    tag=builder.make_tag_urn("nonMatchingTag")
+                ),  # No match
+            ]
+        )
+
+    pipeline_context = PipelineContext(run_id="transformer_pipe_line")
+    pipeline_context.graph = mock_datahub_graph(DatahubClientConfig())
+    pipeline_context.graph.get_tags = fake_get_partial_match_tags  # type: ignore
+
+    config = {"tags": ["example1"]}  # Only 'example1' has a term mapped
+
+    # Running the transformer with partially matching tags
+    output = run_dataset_transformer_pipeline(
+        transformer_type=TagsToTermMapper,
+        aspect=models.GlossaryTermsClass(
+            terms=[
+                models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii"))
+            ],
+            auditStamp=models.AuditStampClass._construct_with_defaults(),
+        ),
+        config=config,
+        pipeline_context=pipeline_context,
+    )
+
+    # Verify that only the matched term is added
+    assert len(output) == 2
+    terms_aspect = output[0].record.aspect
+    assert isinstance(terms_aspect, models.GlossaryTermsClass)
+    assert len(terms_aspect.terms) == 1
+    assert terms_aspect.terms[0].urn == "urn:li:glossaryTerm:example1"