From cd7f751f2cffb4a8147a5f1d4f8a2aa0891446c2 Mon Sep 17 00:00:00 2001 From: Sergio Gomez Villamor Date: Tue, 16 Apr 2024 19:23:43 +0200 Subject: [PATCH 1/7] feat(ingestion/glue): delta schemas --- .../src/datahub/ingestion/source/aws/glue.py | 83 +- .../ingestion/source/delta_lake/source.py | 39 +- .../src/datahub/utilities/delta.py | 34 + .../datahub/utilities/hive_schema_to_avro.py | 1 + .../unit/glue/glue_delta_mces_golden.json | 1548 +++++++++++++++++ .../glue_malformed_delta_mces_golden.json | 128 ++ .../tests/unit/test_glue_source.py | 86 +- .../tests/unit/test_glue_source_stubs.py | 78 + 8 files changed, 1957 insertions(+), 40 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/delta.py create mode 100644 metadata-ingestion/tests/unit/glue/glue_delta_mces_golden.json create mode 100644 metadata-ingestion/tests/unit/glue/glue_malformed_delta_mces_golden.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 0ac13b256eb03..50741e056cbf2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1,3 +1,4 @@ +import json import logging from collections import defaultdict from dataclasses import dataclass, field as dataclass_field @@ -98,6 +99,7 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.utilities.delta import delta_type_to_hive_type from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column logger = logging.getLogger(__name__) @@ -204,6 +206,8 @@ class GlueSourceReport(StaleEntityRemovalSourceReport): num_job_script_failed_parsing: int = 0 num_job_without_nodes: int = 0 num_dataset_to_dataset_edges_in_job: int = 0 + num_dataset_schema_invalid: int = 0 + num_dataset_buggy_delta_schema: int = 0 def report_table_scanned(self) -> None: self.tables_scanned += 1 @@ -1148,9 +1152,41 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: return new_tags def get_schema_metadata() -> Optional[SchemaMetadata]: - if not table.get("StorageDescriptor"): + def is_delta_schema( + columns: Optional[List[Mapping[str, Any]]] + ) -> bool: + return ( + columns is not None + and (len(columns) == 1) + and (columns[0].get("Name", "") == "col") + and (columns[0].get("Type", "") == "array") + ) + + # https://github.com/delta-io/delta/pull/2310 + provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "") + num_parts = int( + table.get("Parameters", {}).get( + "spark.sql.sources.schema.numParts", "0" + ) + ) + columns = table.get("StorageDescriptor", {}).get("Columns", [{}]) + + if ( + (provider == "delta") + and (num_parts > 0) + and is_delta_schema(columns) + ): + return _get_delta_schema_metadata() + + elif table.get("StorageDescriptor"): + return _get_glue_schema_metadata() + + else: return None + def _get_glue_schema_metadata() -> Optional[SchemaMetadata]: + assert table.get("StorageDescriptor") + schema = table["StorageDescriptor"]["Columns"] fields: List[SchemaField] = [] for field in schema: @@ -1183,6 +1219,51 @@ def get_schema_metadata() -> Optional[SchemaMetadata]: platformSchema=MySqlDDL(tableSchema=""), ) + def _get_delta_schema_metadata() -> Optional[SchemaMetadata]: + assert ( + table["Parameters"]["spark.sql.sources.provider"] == "delta" + and int(table["Parameters"]["spark.sql.sources.schema.numParts"]) > 0 + ) + + try: + numParts = int(table["Parameters"]["spark.sql.sources.schema.numParts"]) + schema_str = "".join( + [ + table["Parameters"][f"spark.sql.sources.schema.part.{i}"] + for i in range(numParts) + ] + ) + schema_json = json.loads(schema_str) + fields: List[SchemaField] = [] + for field in schema_json["fields"]: + field_type = delta_type_to_hive_type(field.get("type", "unknown")) + schema_fields = get_schema_fields_for_hive_column( + hive_column_name=field["name"], + hive_column_type=field_type, + description=field.get("description"), + default_nullable=bool(field.get("nullable", True)), + ) + assert schema_fields + fields.extend(schema_fields) + + self.report.num_dataset_buggy_delta_schema += 1 + return SchemaMetadata( + schemaName=table_name, + version=0, + fields=fields, + platform=f"urn:li:dataPlatform:{self.platform}", + hash="", + platformSchema=MySqlDDL(tableSchema=""), + ) + + except Exception as e: + self.report_warning( + dataset_urn, + f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}", + ) + self.report.num_dataset_schema_invalid += 1 + return None + def get_data_platform_instance() -> DataPlatformInstanceClass: return DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py index 39066b0c26553..146b2d3bc2cf1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py @@ -2,7 +2,7 @@ import logging import os import time -from typing import Any, Dict, Iterable, List +from typing import Dict, Iterable, List from urllib.parse import urlparse from deltalake import DeltaTable @@ -51,6 +51,7 @@ SchemaFieldClass, ) from datahub.telemetry import telemetry +from datahub.utilities.delta import delta_type_to_hive_type from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column logging.getLogger("py4j").setLevel(logging.ERROR) @@ -126,46 +127,12 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": config = DeltaLakeSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def delta_type_to_hive_type(self, field_type: Any) -> str: - if isinstance(field_type, str): - """ - return the field type - """ - return field_type - else: - if field_type.get("type") == "array": - """ - if array is of complex type, recursively parse the - fields and create the native datatype - """ - return ( - "array<" - + self.delta_type_to_hive_type(field_type.get("elementType")) - + ">" - ) - elif field_type.get("type") == "struct": - parsed_struct = "" - for field in field_type.get("fields"): - """ - if field is of complex type, recursively parse - and create the native datatype - """ - parsed_struct += ( - "{0}:{1}".format( - field.get("name"), - self.delta_type_to_hive_type(field.get("type")), - ) - + "," - ) - return "struct<" + parsed_struct.rstrip(",") + ">" - return "" - def _parse_datatype(self, raw_field_json_str: str) -> List[SchemaFieldClass]: raw_field_json = json.loads(raw_field_json_str) # get the parent field name and type field_name = raw_field_json.get("name") - field_type = self.delta_type_to_hive_type(raw_field_json.get("type")) + field_type = delta_type_to_hive_type(raw_field_json.get("type")) return get_schema_fields_for_hive_column(field_name, field_type) diff --git a/metadata-ingestion/src/datahub/utilities/delta.py b/metadata-ingestion/src/datahub/utilities/delta.py new file mode 100644 index 0000000000000..170c0a4508a19 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/delta.py @@ -0,0 +1,34 @@ +from typing import Any + + +def delta_type_to_hive_type(field_type: Any) -> str: + if isinstance(field_type, str): + """ + return the field type + """ + return field_type + else: + if field_type.get("type") == "array": + """ + if array is of complex type, recursively parse the + fields and create the native datatype + """ + return ( + "array<" + delta_type_to_hive_type(field_type.get("elementType")) + ">" + ) + elif field_type.get("type") == "struct": + parsed_struct = "" + for field in field_type.get("fields"): + """ + if field is of complex type, recursively parse + and create the native datatype + """ + parsed_struct += ( + "{0}:{1}".format( + field.get("name"), + delta_type_to_hive_type(field.get("type")), + ) + + "," + ) + return "struct<" + parsed_struct.rstrip(",") + ">" + return "" diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index 4fcef990ae4f4..cdca1d958e0e8 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -28,6 +28,7 @@ class HiveColumnToAvroConverter: "bigint": "long", "varchar": "string", "char": "string", + "long": "long", } _COMPLEX_TYPE = re.compile("^(struct|map|array|uniontype)") diff --git a/metadata-ingestion/tests/unit/glue/glue_delta_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_delta_mces_golden.json new file mode 100644 index 0000000000000..f3d4812b79124 --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_delta_mces_golden.json @@ -0,0 +1,1548 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "instance": "delta_platform_instance", + "env": "PROD", + "database": "delta-database" + }, + "name": "delta-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/delta-database" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,delta_platform_instance)" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "spark.sql.sources.provider": "delta", + "spark.sql.sources.schema.numParts": "3", + "spark.sql.sources.schema.part.0": "{\"type\":\"struct\",\"fields\":[{\"name\":\"ecg_session_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"page_type_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_pltfrm_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_dvic_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ga_vstr_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"src_ad_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_ad_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_user_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"src_categ_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_categ_ref_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_geo_ref_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"src_loc_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_region_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_cntry_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_city_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ga_prfl_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_ga_prfl_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ga_vst_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"app_vrsn_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"chnl_group\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_trffc_chnl_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_mdm_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_cmpgn_code\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_cmpgn_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_cntnt_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_ad_kywrd_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_vst_drtn_num\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"home_page_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"session_start_time_num\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"brwsr_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"brwsr_vrsn_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_is_user_login_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lang_code\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_is_direct_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"os_vrsn_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"os_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"host_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_is_direct_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_is_session_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"app_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"encypted_user_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decypted_user_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"encrptd_email\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"page_path_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"scrn_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"socl_engmnt_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"vst_src_path_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_user_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_cmpgn_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_adgroup_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_crtv_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_criteria_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_criteria_param_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_page_num\",\"type\":\"long\",\"nullable\":true,\"metadata\":{", + "spark.sql.sources.schema.part.1": "}},{\"name\":\"adword_slot_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_click_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_ntwrk_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_is_videoad_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"adword_criteria_boomuserlist_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"brwsr_size_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mbl_dvic_brand_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mbl_dvic_model_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mbl_input_slctr_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mbl_dvic_info_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mbl_dvic_mkt_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"flash_vrsn_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"is_java_enabled_flag\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"scrn_color_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"scrn_rsln_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_cntint_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_subcntint_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_metro_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_city_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_ntwrk_dmn_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_ltitd\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_lngtd\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geo_ntwrk_loc_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"session_ab_test_group_txt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_vst_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_new_vstr_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_vst_num\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_hit_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_pv_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_srp_pv_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_unq_srp_pv_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_vip_pv_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_unq_vip_pv_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_scrn_view_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_uniq_scrn_view_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_scrn_drtn_num\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_bnc_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_trxn_cnt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_trxn_rev_amt\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ga_session_list_array\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"clsfd_session_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_sum_dt\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"sess_cd\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"lp_hit_cd\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"ext_map\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_cntry_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cre_date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cre_user\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"upd_date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"upd_user\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clsfd_site_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"", + "spark.sql.sources.schema.part.2": "name\":\"ecg_session_start_dt\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}", + "Location": "s3://crawler-public-us-west-2/delta/" + }, + "name": "delta_table_1", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/delta-database/delta_table_1", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "delta-database.delta_table_1", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=string].ecg_session_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].page_type_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_pltfrm_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_dvic_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].ga_vstr_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].src_ad_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_ad_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_user_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].src_categ_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_categ_ref_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_geo_ref_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].src_loc_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_region_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_cntry_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_city_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].ga_prfl_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].clsfd_ga_prfl_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].ga_vst_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].app_vrsn_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].chnl_group", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].clsfd_trffc_chnl_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_mdm_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_cmpgn_code", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_cmpgn_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_cntnt_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_ad_kywrd_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_vst_drtn_num", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].home_page_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].session_start_time_num", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].brwsr_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].brwsr_vrsn_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_is_user_login_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].lang_code", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_is_direct_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].os_vrsn_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].os_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].host_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].vst_src_is_direct_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_is_session_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].app_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].encypted_user_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].decypted_user_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].encrptd_email", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].page_path_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].scrn_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].socl_engmnt_type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].vst_src_path_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_user_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_cmpgn_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_adgroup_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_crtv_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_criteria_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].adword_criteria_param_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_page_num", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].adword_slot_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].adword_click_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].adword_ntwrk_type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].adword_is_videoad_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].adword_criteria_boomuserlist_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].brwsr_size_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].mbl_dvic_brand_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].mbl_dvic_model_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].mbl_input_slctr_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].mbl_dvic_info_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].mbl_dvic_mkt_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flash_vrsn_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].is_java_enabled_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].scrn_color_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].scrn_rsln_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_cntint_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_subcntint_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_metro_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_city_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_ntwrk_dmn_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_ltitd", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_lngtd", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].geo_ntwrk_loc_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].session_ab_test_group_txt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_vst_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_new_vstr_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_vst_num", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_hit_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_pv_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_srp_pv_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_unq_srp_pv_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_vip_pv_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_unq_vip_pv_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_scrn_view_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_uniq_scrn_view_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_scrn_drtn_num", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_bnc_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_trxn_cnt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=long].clsfd_trxn_rev_amt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].ga_session_list_array", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].ga_session_list_array.[type=string].clsfd_session_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].ga_session_list_array.[type=int].clsfd_sum_dt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "date", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"date\", \"native_data_type\": \"date\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=null].sess_cd", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=null].lp_hit_cd", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=null].ext_map", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].clsfd_cntry_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].cre_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "date", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"date\", \"native_data_type\": \"date\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].cre_user", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].upd_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "date", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"date\", \"native_data_type\": \"date\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].upd_user", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].clsfd_site_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"integer\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].ecg_session_start_dt", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "date", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"date\", \"native_data_type\": \"date\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,delta_platform_instance)" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a" + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_malformed_delta_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_malformed_delta_mces_golden.json new file mode 100644 index 0000000000000..015daaa27162f --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_malformed_delta_mces_golden.json @@ -0,0 +1,128 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "instance": "delta_platform_instance", + "env": "PROD", + "database": "delta-database" + }, + "name": "delta-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/delta-database" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,delta_platform_instance)" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "spark.sql.sources.provider": "delta", + "spark.sql.sources.schema.numParts": "1", + "spark.sql.sources.schema.part.0": "this is totally wrong!", + "Location": "s3://crawler-public-us-west-2/delta/" + }, + "name": "delta_table_1", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/delta-database/delta_table_1", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,delta_platform_instance)" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,delta_platform_instance.delta-database.delta_table_1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:275c7ea5ecf956fd8d45e14228757a8a" + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 8fb840ee003c7..be2a6d3e214f0 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -33,11 +33,15 @@ databases_1, databases_2, get_bucket_tagging, + get_databases_delta_response, get_databases_response, get_databases_response_with_resource_link, get_dataflow_graph_response_1, get_dataflow_graph_response_2, + get_delta_tables_response_1, + get_delta_tables_response_2, get_jobs_response, + get_jobs_response_empty, get_object_body_1, get_object_body_2, get_object_response_1, @@ -57,15 +61,19 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" -def glue_source(platform_instance: Optional[str] = None) -> GlueSource: +def glue_source( + platform_instance: Optional[str] = None, + use_s3_bucket_tags: bool = True, + use_s3_object_tags: bool = True, +) -> GlueSource: return GlueSource( ctx=PipelineContext(run_id="glue-source-test"), config=GlueSourceConfig( aws_region="us-west-2", extract_transforms=True, platform_instance=platform_instance, - use_s3_bucket_tags=True, - use_s3_object_tags=True, + use_s3_bucket_tags=use_s3_bucket_tags, + use_s3_object_tags=use_s3_object_tags, ), ) @@ -336,3 +344,75 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", } + + +def test_glue_with_delta_schema_ingest( + tmp_path: Path, + pytestconfig: PytestConfig, +) -> None: + glue_source_instance = glue_source( + platform_instance="delta_platform_instance", + use_s3_bucket_tags=False, + use_s3_object_tags=False, + ) + + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_delta_response, {}) + glue_stubber.add_response( + "get_tables", + get_delta_tables_response_1, + {"DatabaseName": "delta-database"}, + ) + glue_stubber.add_response("get_jobs", get_jobs_response_empty, {}) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + + glue_stubber.assert_no_pending_responses() + + assert glue_source_instance.get_report().num_dataset_buggy_delta_schema == 1 + + write_metadata_file(tmp_path / "glue_delta_mces.json", mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "glue_delta_mces.json", + golden_path=test_resources_dir / "glue_delta_mces_golden.json", + ) + + +def test_glue_with_malformed_delta_schema_ingest( + tmp_path: Path, + pytestconfig: PytestConfig, +) -> None: + glue_source_instance = glue_source( + platform_instance="delta_platform_instance", + use_s3_bucket_tags=False, + use_s3_object_tags=False, + ) + + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_delta_response, {}) + glue_stubber.add_response( + "get_tables", + get_delta_tables_response_2, + {"DatabaseName": "delta-database"}, + ) + glue_stubber.add_response("get_jobs", get_jobs_response_empty, {}) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + + glue_stubber.assert_no_pending_responses() + + assert glue_source_instance.get_report().num_dataset_schema_invalid == 1 + + write_metadata_file(tmp_path / "glue_malformed_delta_mces.json", mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "glue_malformed_delta_mces.json", + golden_path=test_resources_dir / "glue_malformed_delta_mces_golden.json", + ) \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index 771a0389c1e65..3eb5833fb1157 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -271,6 +271,9 @@ }, ] get_tables_response_2 = {"TableList": tables_2} +get_jobs_response_empty = { + "Jobs": [], +} get_jobs_response = { "Jobs": [ { @@ -787,6 +790,81 @@ job.commit() """ +get_databases_delta_response = { + "DatabaseList": [ + { + "Name": "delta-database", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 14, 19), + "CreateTableDefaultPermissions": [ + { + "Principal": { + "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS" + }, + "Permissions": ["ALL"], + } + ], + "CatalogId": "123412341234", + }, + ] +} +delta_tables_1 = [ + { + "Name": "delta_table_1", + "DatabaseName": "delta-database", + "Owner": "owner", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "Retention": 0, + "StorageDescriptor": { + "Columns": [ + {"Name": "col", "Type": "array", "Comment": "some comment"}, + ], + "Location": "s3://crawler-public-us-west-2/delta/", + }, + "TableType": "EXTERNAL_TABLE", + "Parameters": { + "spark.sql.sources.provider": "delta", + "spark.sql.sources.schema.numParts": "3", + "spark.sql.sources.schema.part.0": '{"type":"struct","fields":[{"name":"ecg_session_id","type":"string","nullable":true,"metadata":{}},{"name":"page_type_txt","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_pltfrm_id","type":"integer","nullable":true,"metadata":{}},{"name":"clsfd_dvic_id","type":"integer","nullable":true,"metadata":{}},{"name":"ga_vstr_id","type":"string","nullable":true,"metadata":{}},{"name":"src_ad_id","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_ad_id","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_user_id","type":"long","nullable":true,"metadata":{}},{"name":"src_categ_id","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_categ_ref_id","type":"integer","nullable":true,"metadata":{}},{"name":"clsfd_geo_ref_id","type":"integer","nullable":true,"metadata":{}},{"name":"src_loc_id","type":"string","nullable":true,"metadata":{}},{"name":"geo_region_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_cntry_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_city_name","type":"string","nullable":true,"metadata":{}},{"name":"ga_prfl_id","type":"integer","nullable":true,"metadata":{}},{"name":"clsfd_ga_prfl_name","type":"string","nullable":true,"metadata":{}},{"name":"ga_vst_id","type":"integer","nullable":true,"metadata":{}},{"name":"app_vrsn_txt","type":"string","nullable":true,"metadata":{}},{"name":"chnl_group","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_trffc_chnl_name","type":"string","nullable":true,"metadata":{}},{"name":"vst_mdm_txt","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_txt","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_cmpgn_code","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_cmpgn_txt","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_cntnt_txt","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_ad_kywrd_txt","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_vst_drtn_num","type":"long","nullable":true,"metadata":{}},{"name":"home_page_txt","type":"string","nullable":true,"metadata":{}},{"name":"session_start_time_num","type":"integer","nullable":true,"metadata":{}},{"name":"brwsr_name","type":"string","nullable":true,"metadata":{}},{"name":"brwsr_vrsn_txt","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_is_user_login_flag","type":"integer","nullable":true,"metadata":{}},{"name":"lang_code","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_is_direct_flag","type":"integer","nullable":true,"metadata":{}},{"name":"os_vrsn_txt","type":"string","nullable":true,"metadata":{}},{"name":"os_name","type":"string","nullable":true,"metadata":{}},{"name":"host_name","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_is_direct_flag","type":"integer","nullable":true,"metadata":{}},{"name":"clsfd_is_session_flag","type":"integer","nullable":true,"metadata":{}},{"name":"app_id","type":"string","nullable":true,"metadata":{}},{"name":"encypted_user_id","type":"string","nullable":true,"metadata":{}},{"name":"decypted_user_id","type":"string","nullable":true,"metadata":{}},{"name":"encrptd_email","type":"string","nullable":true,"metadata":{}},{"name":"page_path_txt","type":"string","nullable":true,"metadata":{}},{"name":"scrn_name","type":"string","nullable":true,"metadata":{}},{"name":"socl_engmnt_type","type":"string","nullable":true,"metadata":{}},{"name":"vst_src_path_txt","type":"string","nullable":true,"metadata":{}},{"name":"adword_user_id","type":"long","nullable":true,"metadata":{}},{"name":"adword_cmpgn_id","type":"long","nullable":true,"metadata":{}},{"name":"adword_adgroup_id","type":"long","nullable":true,"metadata":{}},{"name":"adword_crtv_id","type":"long","nullable":true,"metadata":{}},{"name":"adword_criteria_id","type":"long","nullable":true,"metadata":{}},{"name":"adword_criteria_param_txt","type":"string","nullable":true,"metadata":{}},{"name":"adword_page_num","type":"long","nullable":true,"metadata":{', + "spark.sql.sources.schema.part.1": '}},{"name":"adword_slot_txt","type":"string","nullable":true,"metadata":{}},{"name":"adword_click_id","type":"string","nullable":true,"metadata":{}},{"name":"adword_ntwrk_type","type":"string","nullable":true,"metadata":{}},{"name":"adword_is_videoad_flag","type":"integer","nullable":true,"metadata":{}},{"name":"adword_criteria_boomuserlist_id","type":"long","nullable":true,"metadata":{}},{"name":"brwsr_size_txt","type":"string","nullable":true,"metadata":{}},{"name":"mbl_dvic_brand_name","type":"string","nullable":true,"metadata":{}},{"name":"mbl_dvic_model_name","type":"string","nullable":true,"metadata":{}},{"name":"mbl_input_slctr_name","type":"string","nullable":true,"metadata":{}},{"name":"mbl_dvic_info_txt","type":"string","nullable":true,"metadata":{}},{"name":"mbl_dvic_mkt_name","type":"string","nullable":true,"metadata":{}},{"name":"flash_vrsn_txt","type":"string","nullable":true,"metadata":{}},{"name":"is_java_enabled_flag","type":"integer","nullable":true,"metadata":{}},{"name":"scrn_color_txt","type":"string","nullable":true,"metadata":{}},{"name":"scrn_rsln_txt","type":"string","nullable":true,"metadata":{}},{"name":"geo_cntint_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_subcntint_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_metro_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_city_id","type":"string","nullable":true,"metadata":{}},{"name":"geo_ntwrk_dmn_name","type":"string","nullable":true,"metadata":{}},{"name":"geo_ltitd","type":"string","nullable":true,"metadata":{}},{"name":"geo_lngtd","type":"string","nullable":true,"metadata":{}},{"name":"geo_ntwrk_loc_name","type":"string","nullable":true,"metadata":{}},{"name":"session_ab_test_group_txt","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_vst_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_new_vstr_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_vst_num","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_hit_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_pv_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_srp_pv_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_unq_srp_pv_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_vip_pv_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_unq_vip_pv_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_scrn_view_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_uniq_scrn_view_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_scrn_drtn_num","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_bnc_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_trxn_cnt","type":"long","nullable":true,"metadata":{}},{"name":"clsfd_trxn_rev_amt","type":"long","nullable":true,"metadata":{}},{"name":"ga_session_list_array","type":{"type":"array","elementType":{"type":"struct","fields":[{"name":"clsfd_session_id","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_sum_dt","type":"date","nullable":true,"metadata":{}}]},"containsNull":true},"nullable":true,"metadata":{}},{"name":"sess_cd","type":{"type":"map","keyType":"integer","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"lp_hit_cd","type":{"type":"map","keyType":"integer","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"ext_map","type":{"type":"map","keyType":"integer","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"clsfd_cntry_name","type":"string","nullable":true,"metadata":{}},{"name":"cre_date","type":"date","nullable":true,"metadata":{}},{"name":"cre_user","type":"string","nullable":true,"metadata":{}},{"name":"upd_date","type":"date","nullable":true,"metadata":{}},{"name":"upd_user","type":"string","nullable":true,"metadata":{}},{"name":"clsfd_site_id","type":"integer","nullable":true,"metadata":{}},{"', + "spark.sql.sources.schema.part.2": 'name":"ecg_session_start_dt","type":"date","nullable":true,"metadata":{}}]}', + }, + "CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler", + "IsRegisteredWithLakeFormation": False, + "CatalogId": "123412341234", + } +] +get_delta_tables_response_1 = {"TableList": delta_tables_1} + +delta_tables_2 = [ + { + "Name": "delta_table_1", + "DatabaseName": "delta-database", + "Owner": "owner", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "Retention": 0, + "StorageDescriptor": { + "Columns": [ + {"Name": "col", "Type": "array", "Comment": "some comment"}, + ], + "Location": "s3://crawler-public-us-west-2/delta/", + }, + "TableType": "EXTERNAL_TABLE", + "Parameters": { + "spark.sql.sources.provider": "delta", + "spark.sql.sources.schema.numParts": "1", + "spark.sql.sources.schema.part.0": "this is totally wrong!", + }, + "CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler", + "IsRegisteredWithLakeFormation": False, + "CatalogId": "123412341234", + } +] +get_delta_tables_response_2 = {"TableList": delta_tables_2} + def mock_get_object_response(raw_body: str) -> Dict[str, Any]: """ From ac351e9bcefe77f2cdd40adce5526cddc4371cf1 Mon Sep 17 00:00:00 2001 From: Sergio Gomez Villamor Date: Tue, 16 Apr 2024 19:41:40 +0200 Subject: [PATCH 2/7] lint fix --- .../src/datahub/ingestion/source/aws/glue.py | 10 ++-------- .../tests/unit/test_glue_source.py | 16 ++++++++-------- .../tests/unit/test_glue_source_stubs.py | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 50741e056cbf2..4b81e3cf6f3c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1152,9 +1152,7 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: return new_tags def get_schema_metadata() -> Optional[SchemaMetadata]: - def is_delta_schema( - columns: Optional[List[Mapping[str, Any]]] - ) -> bool: + def is_delta_schema(columns: Optional[List[Mapping[str, Any]]]) -> bool: return ( columns is not None and (len(columns) == 1) @@ -1171,11 +1169,7 @@ def is_delta_schema( ) columns = table.get("StorageDescriptor", {}).get("Columns", [{}]) - if ( - (provider == "delta") - and (num_parts > 0) - and is_delta_schema(columns) - ): + if (provider == "delta") and (num_parts > 0) and is_delta_schema(columns): return _get_delta_schema_metadata() elif table.get("StorageDescriptor"): diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index be2a6d3e214f0..65c88064152db 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -62,9 +62,9 @@ def glue_source( - platform_instance: Optional[str] = None, - use_s3_bucket_tags: bool = True, - use_s3_object_tags: bool = True, + platform_instance: Optional[str] = None, + use_s3_bucket_tags: bool = True, + use_s3_object_tags: bool = True, ) -> GlueSource: return GlueSource( ctx=PipelineContext(run_id="glue-source-test"), @@ -347,8 +347,8 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): def test_glue_with_delta_schema_ingest( - tmp_path: Path, - pytestconfig: PytestConfig, + tmp_path: Path, + pytestconfig: PytestConfig, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -383,8 +383,8 @@ def test_glue_with_delta_schema_ingest( def test_glue_with_malformed_delta_schema_ingest( - tmp_path: Path, - pytestconfig: PytestConfig, + tmp_path: Path, + pytestconfig: PytestConfig, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -415,4 +415,4 @@ def test_glue_with_malformed_delta_schema_ingest( pytestconfig, output_path=tmp_path / "glue_malformed_delta_mces.json", golden_path=test_resources_dir / "glue_malformed_delta_mces_golden.json", - ) \ No newline at end of file + ) diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index 3eb5833fb1157..c971001f97072 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -271,7 +271,7 @@ }, ] get_tables_response_2 = {"TableList": tables_2} -get_jobs_response_empty = { +get_jobs_response_empty: Dict[str, Any] = { "Jobs": [], } get_jobs_response = { From 21b3c4773f3aac54767cd5291d6d249bc88a1346 Mon Sep 17 00:00:00 2001 From: Sergio Gomez Villamor Date: Wed, 17 Apr 2024 09:12:48 +0200 Subject: [PATCH 3/7] update delta golden files --- ...den_mces_tables_with_nested_datatypes.json | 409 +----------------- 1 file changed, 9 insertions(+), 400 deletions(-) diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_tables_with_nested_datatypes.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_tables_with_nested_datatypes.json index c8bf54efa46c2..fbf4578ef6589 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_tables_with_nested_datatypes.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_tables_with_nested_datatypes.json @@ -96,11 +96,11 @@ "jsonProps": "{\"native_data_type\": \"struct<_1:long,_2:string>\"}" }, { - "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=null]._1", + "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=long]._1", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, "nativeDataType": "long", @@ -896,11 +896,11 @@ "jsonProps": "{\"native_data_type\": \"struct<_1:long,_2:string>\"}" }, { - "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=null]._1", + "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=long]._1", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, "nativeDataType": "long", @@ -1491,11 +1491,11 @@ "jsonProps": "{\"native_data_type\": \"struct<_1:long,_2:string>\"}" }, { - "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._1.[type=null]._1", + "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._1.[type=long]._1", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, "nativeDataType": "long", @@ -1530,11 +1530,11 @@ "jsonProps": "{\"native_data_type\": \"struct<_1:long,_2:string>\"}" }, { - "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=null]._1", + "fieldPath": "[version=2.0].[type=struct].[type=struct].data.[type=struct]._2.[type=long]._1", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, "nativeDataType": "long", @@ -1717,7 +1717,7 @@ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=array].[type=array].[type=null].data", + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=array].[type=array].[type=long].data", "nullable": false, "type": { "type": { @@ -1796,397 +1796,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_string_and_nested_array_of_numbers,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:189046201d696e7810132cfa64dad337", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_nested_struct_1,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_nested_struct,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_string_and_array,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_string_and_array_of_struct,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_nested_struct_2,UAT)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:delta-lake,my-platform)" - }, - { - "id": "urn:li:container:189046201d696e7810132cfa64dad337", - "urn": "urn:li:container:189046201d696e7810132cfa64dad337" - }, - { - "id": "urn:li:container:acf0f3806f475a7397ee745329ef2967", - "urn": "urn:li:container:acf0f3806f475a7397ee745329ef2967" - }, - { - "id": "urn:li:container:1876d057d0ee364677b85427342e2c82", - "urn": "urn:li:container:1876d057d0ee364677b85427342e2c82" - }, - { - "id": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4", - "urn": "urn:li:container:7888b6dab77b7e77709699c9a1b81aa4" - }, - { - "id": "urn:li:container:401e53437a2ce6094ab3021cb32919d9", - "urn": "urn:li:container:401e53437a2ce6094ab3021cb32919d9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "tables_with_nested_datatypes.json", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:delta-lake,my-platform.tests/integration/delta_lake/test_data/delta_tables_nested_datatype/table_with_string_and_nested_array_of_numbers,UAT)", From d634b10870bd0408cf3e5497c235bb5f0c4070a8 Mon Sep 17 00:00:00 2001 From: Sergio Gomez Villamor Date: Fri, 10 May 2024 11:03:19 +0200 Subject: [PATCH 4/7] refactor: address PR review comments --- .../src/datahub/ingestion/source/aws/glue.py | 41 ++++++++++++------- .../tests/unit/test_glue_source.py | 8 +++- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 4b81e3cf6f3c0..d6296461b6d29 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -163,6 +163,11 @@ class GlueSourceConfig( stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( default=None, description="" ) + extract_delta_schema_from_parameters: Optional[bool] = Field( + default=False, + description="If enabled, delta schemas can be alternatively fetched from table parameters " + "(https://github.com/delta-io/delta/pull/2310)", + ) def is_profiling_enabled(self) -> bool: return self.profiling is not None and is_profiling_enabled( @@ -206,8 +211,8 @@ class GlueSourceReport(StaleEntityRemovalSourceReport): num_job_script_failed_parsing: int = 0 num_job_without_nodes: int = 0 num_dataset_to_dataset_edges_in_job: int = 0 - num_dataset_schema_invalid: int = 0 - num_dataset_buggy_delta_schema: int = 0 + num_dataset_invalid_delta_schema: int = 0 + num_dataset_valid_delta_schema: int = 0 def report_table_scanned(self) -> None: self.tables_scanned += 1 @@ -1151,15 +1156,23 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: ) return new_tags - def get_schema_metadata() -> Optional[SchemaMetadata]: - def is_delta_schema(columns: Optional[List[Mapping[str, Any]]]) -> bool: - return ( - columns is not None - and (len(columns) == 1) - and (columns[0].get("Name", "") == "col") - and (columns[0].get("Type", "") == "array") - ) + def _is_delta_schema( + provider: str, num_parts: int, columns: Optional[List[Mapping[str, Any]]] + ) -> bool: + return ( + (self.source_config.extract_delta_schema_from_parameters is True) + and (provider == "delta") + and (num_parts > 0) + and (columns is not None) + and (len(columns) == 1) + and (columns[0].get("Name", "") == "col") + and (columns[0].get("Type", "") == "array") + ) + def get_schema_metadata() -> Optional[SchemaMetadata]: + # As soon as the hive integration with Spark is correctly providing the schema as expected in the + # StorageProperties, the alternative path to fetch schema from table parameters can be removed. + # https://github.com/datahub-project/datahub/pull/10299 # https://github.com/delta-io/delta/pull/2310 provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "") num_parts = int( @@ -1169,7 +1182,7 @@ def is_delta_schema(columns: Optional[List[Mapping[str, Any]]]) -> bool: ) columns = table.get("StorageDescriptor", {}).get("Columns", [{}]) - if (provider == "delta") and (num_parts > 0) and is_delta_schema(columns): + if _is_delta_schema(provider, num_parts, columns): return _get_delta_schema_metadata() elif table.get("StorageDescriptor"): @@ -1179,8 +1192,6 @@ def is_delta_schema(columns: Optional[List[Mapping[str, Any]]]) -> bool: return None def _get_glue_schema_metadata() -> Optional[SchemaMetadata]: - assert table.get("StorageDescriptor") - schema = table["StorageDescriptor"]["Columns"] fields: List[SchemaField] = [] for field in schema: @@ -1240,7 +1251,7 @@ def _get_delta_schema_metadata() -> Optional[SchemaMetadata]: assert schema_fields fields.extend(schema_fields) - self.report.num_dataset_buggy_delta_schema += 1 + self.report.num_dataset_valid_delta_schema += 1 return SchemaMetadata( schemaName=table_name, version=0, @@ -1255,7 +1266,7 @@ def _get_delta_schema_metadata() -> Optional[SchemaMetadata]: dataset_urn, f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}", ) - self.report.num_dataset_schema_invalid += 1 + self.report.num_dataset_invalid_delta_schema += 1 return None def get_data_platform_instance() -> DataPlatformInstanceClass: diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 65c88064152db..5e721fc5c1293 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -65,6 +65,7 @@ def glue_source( platform_instance: Optional[str] = None, use_s3_bucket_tags: bool = True, use_s3_object_tags: bool = True, + extract_delta_schema_from_parameters: bool = False, ) -> GlueSource: return GlueSource( ctx=PipelineContext(run_id="glue-source-test"), @@ -74,6 +75,7 @@ def glue_source( platform_instance=platform_instance, use_s3_bucket_tags=use_s3_bucket_tags, use_s3_object_tags=use_s3_object_tags, + extract_delta_schema_from_parameters=extract_delta_schema_from_parameters, ), ) @@ -354,6 +356,7 @@ def test_glue_with_delta_schema_ingest( platform_instance="delta_platform_instance", use_s3_bucket_tags=False, use_s3_object_tags=False, + extract_delta_schema_from_parameters=True, ) with Stubber(glue_source_instance.glue_client) as glue_stubber: @@ -369,7 +372,7 @@ def test_glue_with_delta_schema_ingest( glue_stubber.assert_no_pending_responses() - assert glue_source_instance.get_report().num_dataset_buggy_delta_schema == 1 + assert glue_source_instance.get_report().num_dataset_valid_delta_schema == 1 write_metadata_file(tmp_path / "glue_delta_mces.json", mce_objects) @@ -390,6 +393,7 @@ def test_glue_with_malformed_delta_schema_ingest( platform_instance="delta_platform_instance", use_s3_bucket_tags=False, use_s3_object_tags=False, + extract_delta_schema_from_parameters=True, ) with Stubber(glue_source_instance.glue_client) as glue_stubber: @@ -405,7 +409,7 @@ def test_glue_with_malformed_delta_schema_ingest( glue_stubber.assert_no_pending_responses() - assert glue_source_instance.get_report().num_dataset_schema_invalid == 1 + assert glue_source_instance.get_report().num_dataset_invalid_delta_schema == 1 write_metadata_file(tmp_path / "glue_malformed_delta_mces.json", mce_objects) From 310102e7e6b9dc51cff46ac78f61c77fc6773b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 16 May 2024 13:52:57 +0200 Subject: [PATCH 5/7] Update metadata-ingestion/src/datahub/ingestion/source/aws/glue.py Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index d6296461b6d29..90f545302c0c5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -165,8 +165,7 @@ class GlueSourceConfig( ) extract_delta_schema_from_parameters: Optional[bool] = Field( default=False, - description="If enabled, delta schemas can be alternatively fetched from table parameters " - "(https://github.com/delta-io/delta/pull/2310)", + description="If enabled, delta schemas can be alternatively fetched from table parameters." ) def is_profiling_enabled(self) -> bool: From da5c3b7f0a8c0b36ce44b715f117d345131bba09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 16 May 2024 13:53:21 +0200 Subject: [PATCH 6/7] Update metadata-ingestion/src/datahub/ingestion/source/aws/glue.py Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 90f545302c0c5..c9e51d06ae703 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1170,8 +1170,7 @@ def _is_delta_schema( def get_schema_metadata() -> Optional[SchemaMetadata]: # As soon as the hive integration with Spark is correctly providing the schema as expected in the - # StorageProperties, the alternative path to fetch schema from table parameters can be removed. - # https://github.com/datahub-project/datahub/pull/10299 + # StorageProperties, the alternative path to fetch schema from table parameters for delta schemas can be removed. # https://github.com/delta-io/delta/pull/2310 provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "") num_parts = int( From 4bbadbdff192f267fe913152d030321bbae90243 Mon Sep 17 00:00:00 2001 From: Sergio Gomez Villamor Date: Thu, 16 May 2024 14:35:12 +0200 Subject: [PATCH 7/7] fix lint --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 2 +- metadata-ingestion/src/datahub/utilities/delta.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 0d84de8dbab41..a6393aa9d0ced 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -165,7 +165,7 @@ class GlueSourceConfig( ) extract_delta_schema_from_parameters: Optional[bool] = Field( default=False, - description="If enabled, delta schemas can be alternatively fetched from table parameters." + description="If enabled, delta schemas can be alternatively fetched from table parameters.", ) def is_profiling_enabled(self) -> bool: diff --git a/metadata-ingestion/src/datahub/utilities/delta.py b/metadata-ingestion/src/datahub/utilities/delta.py index 170c0a4508a19..281eec4310c89 100644 --- a/metadata-ingestion/src/datahub/utilities/delta.py +++ b/metadata-ingestion/src/datahub/utilities/delta.py @@ -24,7 +24,7 @@ def delta_type_to_hive_type(field_type: Any) -> str: and create the native datatype """ parsed_struct += ( - "{0}:{1}".format( + "{}:{}".format( field.get("name"), delta_type_to_hive_type(field.get("type")), )