diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 22626dcf2bddfc..cb6e884d57380e 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -102,6 +102,10 @@ "acryl-sqlglot==22.3.1.dev3", } +classification_lib = { + "acryl-datahub-classify==0.0.9", +} + sql_common = ( { # Required for all SQL sources. @@ -121,6 +125,7 @@ } | usage_common | sqlglot_lib + | classification_lib ) sqllineage_lib = { @@ -190,8 +195,7 @@ "pandas", "cryptography", "msal", - "acryl-datahub-classify==0.0.9", -} +} | classification_lib trino = { "trino[sqlalchemy]>=0.308", diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 90847e3c456513..c6c95e76d196fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -233,6 +233,10 @@ def get_columns_to_classify( f"Skipping column {dataset_name}.{schema_field.fieldPath} from classification" ) continue + + # TODO: Let's auto-skip passing sample_data for complex(array/struct) columns + # for initial rollout + column_infos.append( ColumnInfo( metadata=Metadata( @@ -243,9 +247,11 @@ def get_columns_to_classify( "Dataset_Name": dataset_name, } ), - values=sample_data[schema_field.fieldPath] - if schema_field.fieldPath in sample_data.keys() - else [], + values=( + sample_data[schema_field.fieldPath] + if schema_field.fieldPath in sample_data.keys() + else [] + ), ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py new file mode 100644 index 00000000000000..73730a9ea0ef73 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py @@ -0,0 +1,136 @@ +import logging +from abc import abstractmethod +from collections import defaultdict +from typing import Any, 
Dict, List, Union + +import sqlalchemy as sa +from sqlalchemy.engine import Connection, Engine +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.engine.row import LegacyRow + +from datahub.ingestion.api.closeable import Closeable + +logger: logging.Logger = logging.getLogger(__name__) + + +class DataReader(Closeable): + @abstractmethod + def get_sample_data_for_column( + self, table_id: List[str], column_name: str, sample_size: int = 100 + ) -> list: + pass + + @abstractmethod + def get_sample_data_for_table( + self, table_id: List[str], sample_size: int = 100 + ) -> Dict[str, list]: + pass + + +class SqlAlchemyTableDataReader(DataReader): + @staticmethod + def create(inspector: Inspector) -> "SqlAlchemyTableDataReader": + return SqlAlchemyTableDataReader(conn=inspector.bind) + + def __init__( + self, + conn: Union[Engine, Connection], + ) -> None: + # TODO: How can this use a connection pool instead? + self.engine = conn.engine.connect() + + def _table(self, table_id: List[str]) -> sa.Table: + return sa.Table( + table_id[-1], + sa.MetaData(), + schema=table_id[-2] if len(table_id) > 1 else None, + ) + + def get_sample_data_for_column( + self, table_id: List[str], column_name: str, sample_size: int = 100 + ) -> list: + """ + Fetches up to sample_size non-null column values + Args: + table_id: Table name identifier. 
One of + - [db_name, schema_name, table_name] or + - [schema_name, table_name] or + - [table_name] + column_name: Column name + Returns: + list of column values + """ + + table = self._table(table_id) + query: Any + ignore_null_condition = sa.column(column_name).is_(None) + # limit doesn't compile properly for oracle so we will append rownum to query string later + if self.engine.dialect.name.lower() == "oracle": + raw_query = ( + sa.select([sa.column(column_name)]) + .select_from(table) + .where(sa.not_(ignore_null_condition)) + ) + + query = str( + raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}) + ) + query += "\nAND ROWNUM <= %d" % sample_size + else: + query = ( + sa.select([sa.column(column_name)]) + .select_from(table) + .where(sa.not_(ignore_null_condition)) + .limit(sample_size) + ) + query_results = self.engine.execute(query) + + return [x[column_name] for x in query_results.fetchall()] + + def get_sample_data_for_table( + self, table_id: List[str], sample_size: int = 100 + ) -> Dict[str, list]: + """ + Fetches table values, up to sample_size * 1.2 rows + Args: + table_id: Table name identifier. One of + - [db_name, schema_name, table_name] or + - [schema_name, table_name] or + - [table_name] + Returns: + dictionary of (column name -> list of column values) + """ + column_values: Dict[str, list] = defaultdict(list) + table = self._table(table_id) + + # Ideally we do not want null values in sample data for a column. + # However that would require separate query per column and + # that would be expensive. To compensate for possibility + # of some null values in collected sample, we fetch extra (20% more) + # rows than configured sample_size. 
+ sample_size = int(sample_size * 1.2) + + query: Any + + # limit doesn't compile properly for oracle so we will append rownum to query string later + if self.engine.dialect.name.lower() == "oracle": + raw_query = sa.select([sa.text("*")]).select_from(table) + + query = str( + raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}) + ) + query += "\nAND ROWNUM <= %d" % sample_size + else: + query = sa.select([sa.text("*")]).select_from(table).limit(sample_size) + query_results = self.engine.execute(query) + + # Not ideal - creates a parallel structure in column_values. Can we use pandas here ? + for row in query_results.fetchall(): + if isinstance(row, LegacyRow): + for col, col_value in row.items(): + column_values[col].append(col_value) + + return column_values + + def close(self) -> None: + self.engine.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 52a63a6ad182ce..9ec30d57b8f762 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -1,3 +1,4 @@ +import contextlib import datetime import logging import traceback @@ -43,10 +44,18 @@ TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationHandler, + ClassificationReportMixin, +) from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, ) +from datahub.ingestion.source.sql.data_reader import ( + DataReader, + SqlAlchemyTableDataReader, +) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, @@ -120,7 +129,7 @@ @dataclass -class SQLSourceReport(StaleEntityRemovalSourceReport): +class SQLSourceReport(StaleEntityRemovalSourceReport, 
ClassificationReportMixin): tables_scanned: int = 0 views_scanned: int = 0 entities_profiled: int = 0 @@ -314,6 +323,7 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) self.report: SQLSourceReport = SQLSourceReport() self.profile_metadata_info: ProfileMetadata = ProfileMetadata() + self.classification_handler = ClassificationHandler(self.config, self.report) config_report = { config_option: config.dict().get(config_option) for config_option in config_options_to_report @@ -643,6 +653,20 @@ def get_foreign_key_metadata( fk_dict["name"], foreign_fields, source_fields, foreign_dataset ) + def make_data_reader(self, inspector: Inspector) -> Optional[DataReader]: + """ + Subclasses can override this with source-specific data reader + if source provides clause to pick random sample instead of current + limit-based sample + """ + if ( + self.classification_handler + and self.classification_handler.is_classification_enabled() + ): + return SqlAlchemyTableDataReader.create(inspector) + + return None + def loop_tables( # noqa: C901 self, inspector: Inspector, @@ -650,31 +674,40 @@ def loop_tables( # noqa: C901 sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: tables_seen: Set[str] = set() - try: - for table in inspector.get_table_names(schema): - dataset_name = self.get_identifier( - schema=schema, entity=table, inspector=inspector - ) - - if dataset_name not in tables_seen: - tables_seen.add(dataset_name) - else: - logger.debug(f"{dataset_name} has already been seen, skipping...") - continue - - self.report.report_entity_scanned(dataset_name, ent_type="table") - if not sql_config.table_pattern.allowed(dataset_name): - self.report.report_dropped(dataset_name) - continue - - try: - yield from self._process_table( - dataset_name, inspector, schema, table, sql_config + data_reader = self.make_data_reader(inspector) + with (data_reader or contextlib.nullcontext()): + try: + for table in 
inspector.get_table_names(schema): + dataset_name = self.get_identifier( + schema=schema, entity=table, inspector=inspector ) - except Exception as e: - self.warn(logger, f"{schema}.{table}", f"Ingestion error: {e}") - except Exception as e: - self.error(logger, f"{schema}", f"Tables error: {e}") + + if dataset_name not in tables_seen: + tables_seen.add(dataset_name) + else: + logger.debug( + f"{dataset_name} has already been seen, skipping..." + ) + continue + + self.report.report_entity_scanned(dataset_name, ent_type="table") + if not sql_config.table_pattern.allowed(dataset_name): + self.report.report_dropped(dataset_name) + continue + + try: + yield from self._process_table( + dataset_name, + inspector, + schema, + table, + sql_config, + data_reader, + ) + except Exception as e: + self.warn(logger, f"{schema}.{table}", f"Ingestion error: {e}") + except Exception as e: + self.error(logger, f"{schema}", f"Tables error: {e}") def add_information_for_schema(self, inspector: Inspector, schema: str) -> None: pass @@ -691,6 +724,7 @@ def _process_table( schema: str, table: str, sql_config: SQLCommonConfig, + data_reader: Optional[DataReader], ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: columns = self._get_columns(dataset_name, inspector, schema, table) dataset_urn = make_dataset_urn_with_platform_instance( @@ -740,6 +774,8 @@ def _process_table( foreign_keys, schema_fields, ) + self._classify(dataset_name, schema, table, data_reader, schema_metadata) + dataset_snapshot.aspects.append(schema_metadata) if self.config.include_view_lineage: self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) @@ -770,6 +806,39 @@ def _process_table( domain_registry=self.domain_registry, ) + def _classify( + self, + dataset_name: str, + schema: str, + table: str, + data_reader: Optional[DataReader], + schema_metadata: SchemaMetadata, + ) -> None: + try: + if ( + self.classification_handler.is_classification_enabled_for_table( + dataset_name + ) + and data_reader 
+ ): + self.classification_handler.classify_schema_fields( + dataset_name, + schema_metadata, + data_reader.get_sample_data_for_table( + table_id=[schema, table], + sample_size=self.config.classification.sample_size, + ), + ) + except Exception as e: + logger.debug( + f"Failed to classify table columns for {dataset_name} due to error -> {e}", + exc_info=e, + ) + self.report.report_warning( + "Failed to classify table columns", + dataset_name, + ) + def get_database_properties( self, inspector: Inspector, database: str ) -> Optional[Dict[str, str]]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index c0dc70301ba341..d7049fe12cdb05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -12,6 +12,9 @@ LowerCaseDatasetUrnConfigMixin, ) from datahub.configuration.validate_field_removal import pydantic_removed_field +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationSourceConfigMixin, +) from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( StatefulStaleMetadataRemovalConfig, @@ -29,6 +32,7 @@ class SQLCommonConfig( DatasetSourceConfigMixin, LowerCaseDatasetUrnConfigMixin, LineageConfig, + ClassificationSourceConfigMixin, ): options: dict = pydantic.Field( default_factory=dict, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index cf199237e3041c..7668cb01f84bc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -35,6 +35,7 @@ ) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor import schema_util +from 
datahub.ingestion.source.sql.data_reader import DataReader from datahub.ingestion.source.sql.sql_common import ( SQLAlchemySource, SqlWorkUnit, @@ -334,9 +335,10 @@ def _process_table( schema: str, table: str, sql_config: SQLCommonConfig, + data_reader: Optional[DataReader], ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: yield from super()._process_table( - dataset_name, inspector, schema, table, sql_config + dataset_name, inspector, schema, table, sql_config, data_reader ) if self.config.ingest_lineage_to_connectors: dataset_urn = make_dataset_urn_with_platform_instance( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index b89db755853bc3..32f1ba5b8d5635 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -24,6 +24,7 @@ support_status, ) from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.sql.data_reader import DataReader from datahub.ingestion.source.sql.sql_common import ( SQLAlchemySource, SQLSourceReport, @@ -221,6 +222,7 @@ def _process_table( schema: str, table: str, sql_config: SQLCommonConfig, + data_reader: Optional[DataReader], ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: dataset_urn = make_dataset_urn_with_platform_instance( self.platform, @@ -235,7 +237,7 @@ def _process_table( owner_urn=f"urn:li:corpuser:{table_owner}", ) yield from super()._process_table( - dataset_name, inspector, schema, table, sql_config + dataset_name, inspector, schema, table, sql_config, data_reader ) def loop_views( diff --git a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json index cb22a6cb0a346c..95a6e5791a884f 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json +++ 
b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json @@ -197,6 +197,17 @@ }, "nativeDataType": "ENUM('M', 'F')", "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Gender" + } + ], + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + } + }, "isPartOfKey": false }, { @@ -1897,6 +1908,17 @@ }, "nativeDataType": "VARCHAR(length=50)", "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + } + }, "isPartOfKey": false }, { @@ -2192,10 +2214,17 @@ }, { "fieldPath": "email_address", - "uniqueCount": 0, - "nullCount": 5, - "nullProportion": 1, - "sampleValues": [] + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "nullProportion": 0.0, + "sampleValues": [ + "Bedecs@xyz.com", + "Gratacos@xyz.com", + "Axen@xyz.com", + "Lee@xyz.com", + "Donnell@xyz.com" + ] }, { "fieldPath": "priority", @@ -2728,258 +2757,5 @@ "runId": "mysql-test", "lastRunId": "no-run-id-provided" } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,dataCharmer.employees,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302", - "urn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mysql,dataCharmer.salaries,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302", - "urn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:17751259af32dd0385cad799df608c40", - "urn": "urn:li:container:17751259af32dd0385cad799df608c40" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:17751259af32dd0385cad799df608c40", - "urn": "urn:li:container:17751259af32dd0385cad799df608c40" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": 
"urn:li:container:17751259af32dd0385cad799df608c40", - "urn": "urn:li:container:17751259af32dd0385cad799df608c40" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:28176129fe1c0e526e1803250ec124ef", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.myset,PROD)", - 
"changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:28176129fe1c0e526e1803250ec124ef", - "urn": "urn:li:container:28176129fe1c0e526e1803250ec124ef" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:28176129fe1c0e526e1803250ec124ef", - "urn": "urn:li:container:28176129fe1c0e526e1803250ec124ef" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mysql/mysql_mces_with_db_golden.json b/metadata-ingestion/tests/integration/mysql/mysql_mces_with_db_golden.json index f24220b4dbf596..065d6cbe90b313 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_mces_with_db_golden.json @@ -546,10 +546,17 @@ }, { "fieldPath": "email_address", - "uniqueCount": 0, - "nullCount": 5, - "nullProportion": 1, - "sampleValues": [] + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "nullProportion": 0.0, + "sampleValues": [ + "Bedecs@xyz.com", + "Gratacos@xyz.com", + "Axen@xyz.com", + "Lee@xyz.com", + "Donnell@xyz.com" + ] }, { "fieldPath": "priority", @@ -632,63 +639,5 @@ "runId": "mysql-test", "lastRunId": "no-run-id-provided" } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - 
"lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-test", - "lastRunId": "no-run-id-provided" - } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json index 34b18089aeebf4..fc25af0400bb5d 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json @@ -455,7 +455,8 @@ }, { "fieldPath": "email_address", - "uniqueCount": 0, + "uniqueCount": 5, + "uniqueProportion": 1, "nullCount": 0 }, { @@ -513,63 +514,5 @@ "runId": "mysql-2020_04_14-07_00_00", "lastRunId": "no-run-id-provided" } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-2020_04_14-07_00_00", - "lastRunId": 
"no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-2020_04_14-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", - "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "mysql-2020_04_14-07_00_00", - "lastRunId": "no-run-id-provided" - } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mysql/mysql_to_file_dbalias.yml b/metadata-ingestion/tests/integration/mysql/mysql_to_file_dbalias.yml deleted file mode 100644 index 89b87505ab527e..00000000000000 --- a/metadata-ingestion/tests/integration/mysql/mysql_to_file_dbalias.yml +++ /dev/null @@ -1,40 +0,0 @@ -run_id: mysql-test - -source: - type: mysql - config: - username: root - password: example - database: metagalaxy - host_port: localhost:53307 - schema_pattern: - allow: - - "^metagalaxy" - - "^northwind" - - "^datacharmer" - - "^test_cases" - profile_pattern: - allow: - - "^northwind" - - "^datacharmer" - - "^test_cases" - profiling: - enabled: True - include_field_null_count: true - include_field_min_value: true - include_field_max_value: true - include_field_mean_value: true - include_field_median_value: true - include_field_stddev_value: true - include_field_quantiles: true - include_field_distinct_value_frequencies: true 
- include_field_histogram: true - include_field_sample_values: true - domain: - "urn:li:domain:sales": - allow: - - "^metagalaxy" -sink: - type: file - config: - filename: "./mysql_mces_dbalias.json" diff --git a/metadata-ingestion/tests/integration/mysql/mysql_to_file_no_db.yml b/metadata-ingestion/tests/integration/mysql/mysql_to_file_no_db.yml index ee355e4f02b79b..f6acb0dfb408f6 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_to_file_no_db.yml +++ b/metadata-ingestion/tests/integration/mysql/mysql_to_file_no_db.yml @@ -34,6 +34,19 @@ source: "urn:li:domain:sales": allow: - "^metagalaxy" + classification: + enabled: True + classifiers: + - type: datahub + config: + minimum_values_threshold: 1 + info_types_config: + Full_Name: + prediction_factors_and_weights: + name: 0.5 + description: 0 + datatype: 0 + values: 0.5 sink: type: file config: diff --git a/metadata-ingestion/tests/integration/mysql/setup/setup.sql b/metadata-ingestion/tests/integration/mysql/setup/setup.sql index c8a88aff0f2533..9b8c48bc744f7a 100644 --- a/metadata-ingestion/tests/integration/mysql/setup/setup.sql +++ b/metadata-ingestion/tests/integration/mysql/setup/setup.sql @@ -249,11 +249,11 @@ USE `northwind`; # Dumping data for table 'customers' # -INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (1, 'Company A', 'Bedecs', 'Anna', NULL, 4); -INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (2, 'Company B', 'Gratacos Solsona', 'Antonio', NULL, 4.9); -INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (3, 'Company C', 'Axen', 'Thomas', NULL, 4); -INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (4, 'Company D', 'Lee', 'Christina', NULL, 3.8); -INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (5, 'Company E', 
'Donnell', 'Martin', NULL, NULL); +INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (1, 'Company A', 'Bedecs', 'Anna', 'Bedecs@xyz.com', 4); +INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (2, 'Company B', 'Gratacos Solsona', 'Antonio', 'Gratacos@xyz.com', 4.9); +INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (3, 'Company C', 'Axen', 'Thomas', 'Axen@xyz.com', 4); +INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (4, 'Company D', 'Lee', 'Christina', 'Lee@xyz.com', 3.8); +INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (5, 'Company E', 'Donnell', 'Martin', 'Donnell@xyz.com', NULL); # 5 records -- ----------------------------------------------------- diff --git a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json index f6fa0a0ed032ef..eda8ffbac1618f 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json @@ -258,6 +258,17 @@ }, "nativeDataType": "VARCHAR(length=500)", "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:URN" + } + ], + "auditStamp": { + "time": 1646575200000, + "actor": "urn:li:corpuser:datahub" + } + }, "isPartOfKey": true }, { @@ -330,6 +341,17 @@ }, "nativeDataType": "VARCHAR(length=255)", "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:URN" + } + ], + "auditStamp": { + "time": 1646575200000, + "actor": "urn:li:corpuser:datahub" + } + }, "isPartOfKey": false }, { diff --git 
a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml index 4a2cc543f2d011..aaa5c2438257b5 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml +++ b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml @@ -14,6 +14,24 @@ source: turn_off_expensive_profiling_metrics: true catch_exceptions: true include_views: true + classification: + enabled: True + classifiers: + - type: datahub + config: + minimum_values_threshold: 1 + info_types_config: + URN: + prediction_factors_and_weights: + name: 0 + description: 0 + datatype: 0 + values: 1 + values: + prediction_type: regex + regex: + - "^urn:li:.*:.*" + library: [] sink: type: file config: diff --git a/metadata-ingestion/tests/integration/trino/setup/hive_setup.sql b/metadata-ingestion/tests/integration/trino/setup/hive_setup.sql index 4618378cb26b63..0c24934c92736a 100644 --- a/metadata-ingestion/tests/integration/trino/setup/hive_setup.sql +++ b/metadata-ingestion/tests/integration/trino/setup/hive_setup.sql @@ -56,4 +56,13 @@ CREATE TABLE db1.union_test( foo UNIONTYPE, struct> ) STORED AS ORC ; -CREATE TABLE db1.map_test(KeyValue String, RecordId map); \ No newline at end of file +CREATE TABLE db1.map_test(KeyValue String, RecordId map); + +CREATE TABLE db1.classification_test(id STRING, name STRING, email STRING, gender STRING, age INT); + +INSERT INTO + db1.classification_test +VALUES + ("1", "Foo Bar", "foo@bar.com", "M", 21), + ("2", "John Doe", "john.doe@example.com", "M", 30), + ("3", "Jane Doe", "jane.doe@abc.com", "F", 27); \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/trino/test_trino.py b/metadata-ingestion/tests/integration/trino/test_trino.py index 4e24b5c8871943..6437666fed62b5 100644 --- a/metadata-ingestion/tests/integration/trino/test_trino.py 
+++ b/metadata-ingestion/tests/integration/trino/test_trino.py @@ -5,6 +5,11 @@ from freezegun import freeze_time from datahub.configuration.common import AllowDenyPattern +from datahub.ingestion.glossary.classifier import ( + ClassificationConfig, + DynamicTypedClassifierConfig, +) +from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.sink.file import FileSinkConfig from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig @@ -87,6 +92,18 @@ def test_trino_ingest( include_field_histogram=True, include_field_sample_values=True, ), + classification=ClassificationConfig( + enabled=True, + classifiers=[ + DynamicTypedClassifierConfig( + type="datahub", + config=DataHubClassifierConfig( + minimum_values_threshold=1, + ), + ) + ], + max_workers=1, + ), catalog_to_connector_details={ "postgresqldb": ConnectorDetail( connector_database="postgres", @@ -131,6 +148,18 @@ def test_trino_hive_ingest( database="hivedb", username="foo", schema_pattern=AllowDenyPattern(allow=["^db1"]), + classification=ClassificationConfig( + enabled=True, + classifiers=[ + DynamicTypedClassifierConfig( + type="datahub", + config=DataHubClassifierConfig( + minimum_values_threshold=1, + ), + ) + ], + max_workers=1, + ), ).dict(), }, "sink": { diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json index d63995506cb9c3..c5664b9373e8c5 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json @@ -244,7 +244,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1708925463" + "transient_lastddltime": "1710150034" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -471,6 +471,265 
@@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:46baa6eebd802861e5ee3d043456e171" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "1", + "numrows": "3", + "rawdatasize": "94", + "totalsize": "97", + "transient_lastddltime": "1710150038" + }, + "name": "classification_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.classification_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"email", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "gender", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:trino", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "siblings", + "aspect": { + "json": { + "siblings": [ + 
"urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)" + ], + "primary": true + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" + }, + { + "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", + "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" + }, + { + "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", + "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "siblings", + "aspect": { + "json": { + "siblings": [ + "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)" + ], + "primary": false + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)", + "type": "VIEW" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.map_test,PROD)", @@ -505,7 +764,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710150036" }, "name": "map_test", "tags": [] @@ -732,7 +991,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710150036" }, "name": "nested_struct_test", "tags": [] @@ -1003,7 +1262,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1708925457" + "transient_lastddltime": "1710150028" }, "name": "pokes", "tags": [] @@ -1238,7 +1497,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925459" + "transient_lastddltime": "1710150031" }, "name": "struct_test", "tags": [] @@ -1489,7 +1748,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710150036" }, "name": "struct_test_view_materialized", "tags": [] @@ -1743,7 +2002,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925459" + "transient_lastddltime": "1710150031" }, "name": "_test_table_underscore", "tags": [] @@ -1966,7 +2225,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710150036" }, "name": "union_test", "tags": [] @@ -2268,7 +2527,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1708925466", + "transient_lastddltime": "1710150036", "view_definition": 
"SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", "is_view": "True" }, @@ -2586,7 +2845,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.map_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2602,7 +2861,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.nested_struct_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.map_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2618,7 +2877,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.pokes,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.nested_struct_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2634,7 +2893,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.struct_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.pokes,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2650,7 +2909,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.struct_test_view_materialized,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.struct_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2666,7 +2925,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.union_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.struct_test_view_materialized,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2681,304 +2940,13 @@ } }, { - "entityType": "container", - 
"entityUrn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.union_test,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - 
}, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.map_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.nested_struct_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.pokes,PROD)", - "changeType": "UPSERT", - "aspectName": 
"browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.struct_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.struct_test_view_materialized,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": 
"urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1._test_table_underscore,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.union_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": 
"no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)" - }, - { - "id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b", - "urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - }, - { - "id": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171" - } - ] + "removed": false } }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json index 3e79c8721486e2..18921c93505876 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json @@ -231,7 +231,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1708925463" + "transient_lastddltime": "1710149909" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -437,6 +437,288 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "1", + "numrows": "3", + "rawdatasize": "94", + "totalsize": "97", + "transient_lastddltime": "1710149912" + }, + "name": "classification_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.classification_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Full_Name" + } + ], + "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + }, + { + "fieldPath": "gender", + "nullable": true, + 
"type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Gender" + } + ], + "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + }, + { + "fieldPath": "age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "siblings", + "aspect": { + "json": { + "siblings": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)" + ], + "primary": true + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": 
"urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", + "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" + }, + { + "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", + "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "siblings", + "aspect": { + "json": { + "siblings": [ + "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)" + ], + "primary": false + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)", + "type": "VIEW" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)", @@ -471,7 +753,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710149911" }, "name": "map_test", "tags": [] @@ -677,7 +959,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710149911" }, "name": "nested_struct_test", "tags": [] @@ -927,7 +1209,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { 
"customProperties": { - "transient_lastddltime": "1708925457" + "transient_lastddltime": "1710149904" }, "name": "pokes", "tags": [] @@ -1141,7 +1423,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925459" + "transient_lastddltime": "1710149906" }, "name": "struct_test", "tags": [] @@ -1371,7 +1653,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710149911" }, "name": "struct_test_view_materialized", "tags": [] @@ -1604,7 +1886,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925459" + "transient_lastddltime": "1710149906" }, "name": "_test_table_underscore", "tags": [] @@ -1806,7 +2088,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1708925466" + "transient_lastddltime": "1710149911" }, "name": "union_test", "tags": [] @@ -2087,7 +2369,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1708925466", + "transient_lastddltime": "1710149911", "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", "is_view": "True" }, @@ -2384,7 +2666,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2400,7 +2682,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.nested_struct_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2416,7 +2698,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.pokes,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.nested_struct_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2432,7 +2714,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.pokes,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2448,7 +2730,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2464,7 +2746,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.union_test,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2478,260 +2760,14 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", - "changeType": 
"UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.nested_struct_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": 
"urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - 
}, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.union_test,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.union_test,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7", - "urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - }, - { - "id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84" - } - ] + "removed": false } }, "systemMetadata": {