From df9755c9483d9d46603c82b122bbece71dad89be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Wed, 4 Dec 2024 10:06:25 +0100 Subject: [PATCH] feat(tableau): review reporting and debug traces (#12015) Co-authored-by: Harshal Sheth --- .../ingestion/source/tableau/tableau.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 0eafdb4ad23ba0..f3ad5ea706f7ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -289,16 +289,12 @@ def make_tableau_client(self, site: str) -> Server: server.auth.sign_in(authentication) return server except ServerResponseError as e: + message = f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}" if isinstance(authentication, PersonalAccessTokenAuth): # Docs on token expiry in Tableau: # https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm#token-expiry - logger.info( - "Error authenticating with Tableau. Note that Tableau personal access tokens " - "expire if not used for 15 days or if over 1 year old" - ) - raise ValueError( - f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}" - ) from e + message = f"Error authenticating with Tableau. Note that Tableau personal access tokens expire if not used for 15 days or if over 1 year old: {str(e)}" + raise ValueError(message) from e except Exception as e: raise ValueError( f"Unable to login (check your Tableau connection and credentials): {str(e)}" @@ -722,6 +718,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: title="Failed to Retrieve Tableau Metadata", message="Unable to retrieve metadata from tableau.", context=str(md_exception), + exc=md_exception, ) def close(self) -> None: @@ -826,6 +823,7 @@ def _populate_usage_stat_registry(self) -> None: if not view.id: continue self.tableau_stat_registry[view.id] = UsageStat(view_count=view.total_views) + logger.info(f"Got Tableau stats for {len(self.tableau_stat_registry)} assets") logger.debug("Tableau stats %s", self.tableau_stat_registry) def _populate_database_server_hostname_map(self) -> None: @@ -876,7 +874,7 @@ def form_path(project_id: str) -> List[str]: ancestors = [cur_proj.name] while cur_proj.parent_id is not None: if cur_proj.parent_id not in all_project_map: - self.report.report_warning( + self.report.warning( "project-issue", f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.", ) @@ -974,8 +972,11 @@ def _init_datasource_registry(self) -> None: self.datasource_project_map[ds.id] = ds.project_id except Exception as e: self.report.get_all_datasources_query_failed = True - logger.info(f"Get all datasources query failed due to error {e}") - logger.debug("Error stack trace", exc_info=True) + self.report.warning( + title="Unexpected Query Error", + message="Get all datasources query failed due to error", + exc=e, + ) def _init_workbook_registry(self) -> None: if self.server is None: @@ -1141,7 +1142,6 @@ def get_connection_object_page( ) if node_limit_errors: - logger.debug(f"Node Limit Error. query_data {query_data}") self.report.warning( title="Tableau Data Exceed Predefined Limit", message="The numbers of record in result set exceeds a predefined limit. Increase the tableau " @@ -1257,9 +1257,10 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: wrk_id: Optional[str] = workbook.get(c.ID) prj_name: Optional[str] = workbook.get(c.PROJECT_NAME) - logger.debug( - f"Skipping workbook {wrk_name}({wrk_id}) as it is project {prj_name}({project_luid}) not " - f"present in project registry" + self.report.warning( + title="Skipping Missing Workbook", + message="Skipping workbook as its project is not present in project registry", + context=f"workbook={wrk_name}({wrk_id}), project={prj_name}({project_luid})", ) continue @@ -1453,7 +1454,7 @@ def get_upstream_tables( c.COLUMNS_CONNECTION ].get("totalCount") if not is_custom_sql and not num_tbl_cols: - logger.debug( + logger.warning( f"Skipping upstream table with id {table[c.ID]}, no columns: {table}" ) continue @@ -1469,7 +1470,12 @@ def get_upstream_tables( table, default_schema_map=self.config.default_schema_map ) except Exception as e: - logger.info(f"Failed to generate upstream reference for {table}: {e}") + self.report.warning( + title="Potentially Missing Lineage Issue", + message="Failed to generate upstream reference", + exc=e, + context=f"table={table}", + ) continue table_urn = ref.make_dataset_urn( @@ -1917,10 +1923,12 @@ def _query_published_datasource_for_project_luid(self, ds_luid: str) -> None: self.datasource_project_map[ds_result.id] = ds_result.project_id except Exception as e: self.report.num_get_datasource_query_failures += 1 - logger.warning( - f"Failed to get datasource project_luid for {ds_luid} due to error {e}" + self.report.warning( + title="Unexpected Query Error", + message="Failed to get datasource details", + exc=e, + context=f"ds_luid={ds_luid}", ) - logger.debug("Error stack trace", exc_info=True) def _get_workbook_project_luid(self, wb: dict) -> Optional[str]: if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]):