diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index 0acc134d4ef00b..5421a932daccee 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -22,7 +22,7 @@ The below table shows transformer which can transform aspects of entity [Dataset |-----------------------------|----------|---------|---------------|---------------------------------------------| | `tag_pattern` | | str | | Regex to use for tags to match against. Supports Regex to match a pattern which is used to remove content. Rest of string is considered owner ID for creating owner URN. | | `is_user` | | bool | `true` | Whether should be consider a user or not. If `false` then considered a group. | -| `owner_character_mapping` | | dict[str, str] | | A mapping of extracted owner character to datahub owner character. | +| `tag_character_mapping` | | dict[str, str] | | A mapping of tag character to datahub owner character. If provided, `tag_pattern` config should be matched against converted tag as per mapping| | `email_domain` | | str | | If set then this is appended to create owner URN. | | `extract_owner_type_from_tag_pattern` | | str | `false` | Whether to extract an owner type from provided tag pattern first group. If `true`, no need to provide owner_type and owner_type_urn config. For example: if provided tag pattern is `(.*)_owner_email:` and actual tag is `developer_owner_email`, then extracted owner type will be `developer`.| | `owner_type` | | str | `TECHNICAL_OWNER` | Ownership type. | @@ -40,14 +40,14 @@ transformers: ``` So if we have input dataset tag like -- `urn:li:tag:dataset_owner_email:abc@email.com` -- `urn:li:tag:dataset_owner_email:xyz@email.com` +- `urn:li:tag:owner_email:abc@email.com` +- `urn:li:tag:owner_email:xyz@email.com` The portion of the tag after the matched tag pattern will be converted into an owner. 
Hence users `abc@email.com` and `xyz@email.com` will be added as owners. ### Examples -- Add owners, however owner should be considered as group and also email domain not provided in tag string. For example: from tag urn `urn:li:tag:dataset_owner:abc` extracted owner urn should be `urn:li:corpGroup:abc@email.com` then config would look like this: +- Add owners, however owner should be considered as group and also email domain not provided in tag string. For example: from tag urn `urn:li:tag:owner:abc` extracted owner urn should be `urn:li:corpGroup:abc@email.com` then config would look like this: ```yaml transformers: - type: "extract_ownership_from_tags" @@ -56,7 +56,7 @@ The portion of the tag after the matched tag pattern will be converted into an o is_user: false email_domain: "email.com" ``` -- Add owners, however owner type and owner type urn wanted to provide externally. For example: from tag urn `urn:li:tag:dataset_owner_email:abc@email.com` owner type should be `CUSTOM` and owner type urn as `"urn:li:ownershipType:data_product"` then config would look like this: +- Add owners, however owner type and owner type urn wanted to provide externally. For example: from tag urn `urn:li:tag:owner_email:abc@email.com` owner type should be `CUSTOM` and owner type urn as `"urn:li:ownershipType:data_product"` then config would look like this: ```yaml transformers: - type: "extract_ownership_from_tags" @@ -65,15 +65,17 @@ The portion of the tag after the matched tag pattern will be converted into an o owner_type: "CUSTOM" owner_type_urn: "urn:li:ownershipType:data_product" ``` -- Add owners, however some owner characters needs to replace with some other characters before ingestion. For example: from tag urn `urn:li:tag:dataset_owner_email:abc_xyz-email_com` extracted owner urn should be `urn:li:corpGroup:abc.xyz@email.com` then config would look like this: +- Add owners, however some tag characters need to be replaced with other characters before extracting the owner. 
For example: from tag urn `urn:li:tag:owner__email:abc--xyz-email_com` extracted owner urn should be `urn:li:corpuser:abc-xyz@email.com` then config would look like this: ```yaml transformers: - type: "extract_ownership_from_tags" config: tag_pattern: "owner_email:" - owner_character_mapping: - "_": ".", - "-": "@", + tag_character_mapping: + "_": "." + "-": "@" + "--": "-" + "__": "_" ``` - Add owners, however owner type also need to extracted from tag pattern. For example: from tag urn `urn:li:tag:data_producer_owner_email:abc@email.com` extracted owner type should be `data_producer` then config would look like this: ```yaml diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py index e509b4b719166b..27311ff998cbf9 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -26,7 +26,7 @@ class ExtractOwnersFromTagsConfig(ConfigModel): tag_pattern: str = "" is_user: bool = True - owner_character_mapping: Optional[Dict[str, str]] = None + tag_character_mapping: Optional[Dict[str, str]] = None email_domain: Optional[str] = None extract_owner_type_from_tag_pattern: bool = False owner_type: str = "TECHNICAL_OWNER" @@ -70,18 +70,35 @@ def get_owner_urn(self, owner_str: str) -> str: return owner_str + "@" + self.config.email_domain return owner_str - def convert_owner_as_per_mapping(self, owner: str) -> str: - if self.config.owner_character_mapping: - # Sort the provided mapping by its length. - # Eg: Suppose we have {"_":".", "__":"#"} character mapping. - # In this case "__" character should get replace first compare to "_" character. 
- for key in sorted( - self.config.owner_character_mapping.keys(), + def convert_tag_as_per_mapping(self, tag: str) -> str: + """ + Function to modify tag as per provided tag character mapping. It also handles the overlappings in the mapping. + Eg: '--':'-' & '-':'@' should not cause incorrect mapping. + """ + if self.config.tag_character_mapping: + # indices list to keep track of the indices where replacements have been made + indices: List[int] = list() + for old_char in sorted( + self.config.tag_character_mapping.keys(), key=len, reverse=True, ): - owner = owner.replace(key, self.config.owner_character_mapping[key]) - return owner + new_char = self.config.tag_character_mapping[old_char] + index = tag.find(old_char) + while index != -1: + if index not in indices: + tag = tag[:index] + new_char + tag[index + len(old_char) :] + # Adjust indices for overlapping replacements + indices = [ + each + (len(new_char) - len(old_char)) + if each > index + else each + for each in indices + ] + indices.append(index) + # Find the next occurrence of old_char, starting from the next index + index = tag.find(old_char, index + len(new_char)) + return tag def handle_end_of_stream( self, @@ -100,10 +117,10 @@ def transform_aspect( for tag_class in tags: tag_str = TagUrn.from_string(tag_class.tag).name + tag_str = self.convert_tag_as_per_mapping(tag_str) re_match = re.search(self.config.tag_pattern, tag_str) if re_match: owner_str = tag_str[re_match.end() :].strip() - owner_str = self.convert_owner_as_per_mapping(owner_str) owner_urn_str = self.get_owner_urn(owner_str) owner_urn = ( str(CorpuserUrn(owner_urn_str)) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index c31ec12abfbd71..3782eb0e275f31 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -742,20 +742,18 @@ def _test_owner( 
expected_owner_type_urn="urn:li:ownershipType:ad8557d6-dcb9-4d2a-83fc-b7d0d54f3e0f", ) _test_owner( - tag="data_producer_owner_email:abc_xyz-email_com", + tag="data__producer__owner__email:abc--xyz-email_com", config={ "tag_pattern": "(.*)_owner_email:", - "owner_character_mapping": { + "tag_character_mapping": { "_": ".", "-": "@", "__": "_", "--": "-", - "_-": "#", - "-_": " ", }, "extract_owner_type_from_tag_pattern": True, }, - expected_owner="urn:li:corpuser:abc.xyz@email.com", + expected_owner="urn:li:corpuser:abc-xyz@email.com", expected_owner_type=OwnershipTypeClass.CUSTOM, expected_owner_type_urn="urn:li:ownershipType:data_producer", )