From 0ce4d1e9427fe7cb972cd278a2e18c5c7a4ff7d2 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Fri, 27 Oct 2023 12:46:16 +0530 Subject: [PATCH 1/2] fix(ingest/bigquery): use correct row count in null count profiling computation --- .../src/datahub/ingestion/source/ge_data_profiler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 9f6ac9dd211642..46cc4c8b26c31f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -781,7 +781,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non sample_pc = 100 * self.config.sample_size / profile.rowCount sql = ( f"SELECT * FROM {str(self.dataset._table)} " - + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)" + + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)" ) temp_table_name = create_bigquery_temp_table( self, @@ -793,6 +793,13 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non self.dataset._table = sa.text(temp_table_name) logger.debug(f"Setting table name to be {self.dataset._table}") + # We can alternatively use `self._get_dataset_rows(profile)` to get + # exact count of rows in sample, as actual rows involved in sample + # may be slightly different (more or less) than configured `sample_size`. + # However not doing so to start with, as that adds another query overhead + # plus approximate metrics should work for sampling based profiling. + profile.rowCount = self.config.sample_size + if ( profile.partitionSpec and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE From f3cd6d4429c2d00c511b7c3c66f74fe6c7f2b071 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 30 Oct 2023 11:31:43 +0530 Subject: [PATCH 2/2] update only local row count assignment --- .../ingestion/source/ge_data_profiler.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 46cc4c8b26c31f..4998a96d8e5f03 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -629,7 +629,16 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() assert profile.rowCount is not None - row_count: int = profile.rowCount + row_count: int # used for null counts calculation + if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition: + # We can alternatively use `self._get_dataset_rows(profile)` to get + # exact count of rows in sample, as actual rows involved in sample + # may be slightly different (more or less) than configured `sample_size`. + # However not doing so to start with, as that adds another query overhead + # plus approximate metrics should work for sampling based profiling. + row_count = self.config.sample_size + else: + row_count = profile.rowCount for column_spec in columns_profiling_queue: column = column_spec.column @@ -793,13 +802,6 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non self.dataset._table = sa.text(temp_table_name) logger.debug(f"Setting table name to be {self.dataset._table}") - # We can alternatively use `self._get_dataset_rows(profile)` to get - # exact count of rows in sample, as actual rows involved in sample - # may be slightly different (more or less) than configured `sample_size`. - # However not doing so to start with, as that adds another query overhead - # plus approximate metrics should work for sampling based profiling. - profile.rowCount = self.config.sample_size - if ( profile.partitionSpec and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE