Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingest/powerbi): powerbi dataset profiling #9355

Merged
merged 18 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ By default, extracting endorsement information to tags is disabled. The feature

Please note that the default implementation overwrites tags for the ingested entities, if you need to preserve existing tags, consider using a [transformer](../../../../metadata-ingestion/docs/transformer/dataset_transformer.md#simple-add-dataset-globaltags) with `semantics: PATCH` tags instead of `OVERWRITE`.

## Profiling

The profiling implementation is done by querying the [DAX query endpoint](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries). The principal therefore needs permission to query the datasets that are to be profiled. Profiling is performed with column-based queries so that wide datasets can be handled without timeouts. Note that dataset profiling is not available when only the Admin API is used (`admin_apis_only: true`).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additional documentation need to be added:

  1. Please also check this doc: https://datahubproject.io/docs/quick-ingestion-guides/powerbi/setup and add/update any steps if any specific permission is required for profiling. The markdown file is available at docs/quick-ingestion-guides/powerbi/setup.md
  2. On Source class add decorator @capability( SourceCapability.DATA_PROFILING, "Optionally enabled via configuration profiling.enabled", )


## Admin Ingestion vs. Basic Ingestion
PowerBI provides two sets of API i.e. [Basic API and Admin API](https://learn.microsoft.com/en-us/rest/api/power-bi/).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please update the Caveats of setting admin_apis_only to true: and add a bullet point for dataset profiling is not available

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the delay, I added mention that the dataset profiling is not available through the Admin API

Expand Down
18 changes: 18 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,12 @@ class Constant:
STATUS = "status"
CHART_ID = "powerbi.linkedin.com/charts/{}"
CHART_KEY = "chartKey"
COLUMN_TYPE = "columnType"
DATA_TYPE = "dataType"
DASHBOARD = "dashboard"
DASHBOARDS = "dashboards"
DASHBOARD_KEY = "dashboardKey"
DESCRIPTION = "description"
OWNERSHIP = "ownership"
BROWSERPATH = "browsePaths"
DASHBOARD_INFO = "dashboardInfo"
Expand Down Expand Up @@ -108,6 +111,7 @@ class Constant:
TABLES = "tables"
EXPRESSION = "expression"
SOURCE = "source"
SCHEMA_METADATA = "schemaMetadata"
PLATFORM_NAME = "powerbi"
REPORT_TYPE_NAME = BIAssetSubTypes.REPORT
CHART_COUNT = "chartCount"
Expand Down Expand Up @@ -228,6 +232,13 @@ class OwnershipMapping(ConfigModel):
)


class PowerBiProfilingConfig(ConfigModel):
    # Master switch for PowerBI dataset profiling. Off by default because
    # profiling issues DAX queries against each dataset and therefore
    # requires the principal to have dataset query permission.
    enabled: bool = pydantic.Field(
        default=False,
        description="Whether profiling of PowerBI datasets should be done",
    )


class PowerBiDashboardSourceConfig(
StatefulIngestionConfigBase, DatasetSourceConfigMixin
):
Expand Down Expand Up @@ -405,6 +416,13 @@ class PowerBiDashboardSourceConfig(
"Works for M-Query where native SQL is used for transformation.",
)

# Regex filter for which tables get profiled. The pattern is matched against
# the fully qualified name 'workspacename.datasetname.tablename' (and, for
# column-level filtering, 'workspacename.datasetname.tablename.columnname').
profile_pattern: AllowDenyPattern = pydantic.Field(
    default=AllowDenyPattern.allow_all(),
    description="Regex patterns to filter tables for profiling during ingestion. Note that only tables "
    "allowed by the `table_pattern` will be considered. Matched format is "
    "'workspacename.datasetname.tablename', e.g. 'MyWorkspace.sales_dataset.orders'",
)
# Nested profiling options; profiling is disabled unless profiling.enabled is set.
profiling: PowerBiProfilingConfig = PowerBiProfilingConfig()

@root_validator(skip_on_failure=True)
def validate_extract_column_level_lineage(cls, values: Dict) -> Dict:
flags = [
Expand Down
55 changes: 55 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulIngestionSourceBase,
)
from datahub.metadata._schema_classes import DatasetProfileClass
from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
FineGrainedLineage,
Expand All @@ -58,6 +59,7 @@
CorpUserKeyClass,
DashboardInfoClass,
DashboardKeyClass,
DatasetFieldProfileClass,
DatasetLineageTypeClass,
DatasetPropertiesClass,
GlobalTagsClass,
Expand Down Expand Up @@ -473,9 +475,62 @@ def to_datahub_dataset(
Constant.DATASET,
dataset.tags,
)
self.extract_profile(dataset_mcps, workspace, dataset, table, ds_urn)

return dataset_mcps

def extract_profile(
    self,
    dataset_mcps: List[MetadataChangeProposalWrapper],
    workspace: powerbi_data_classes.Workspace,
    dataset: powerbi_data_classes.PowerBIDataset,
    table: powerbi_data_classes.Table,
    ds_urn: str,
) -> None:
    """Append a datasetProfile aspect MCP for *table* to *dataset_mcps*.

    No-op when profiling is disabled in the source config or when the table
    is excluded by ``profile_pattern`` (matched against
    'workspace.dataset.table'). Hidden columns and columns excluded by the
    pattern (matched against 'workspace.dataset.table.column') are skipped.

    :param dataset_mcps: output list the profile MCP is appended to (mutated).
    :param workspace: workspace containing the dataset.
    :param dataset: the PowerBI dataset the table belongs to.
    :param table: table whose pre-collected profiling data is emitted.
    :param ds_urn: DataHub dataset URN the profile aspect is attached to.
    """
    if not self.__config.profiling.enabled:
        # Profiling not enabled
        return

    if not self.__config.profile_pattern.allowed(
        f"{workspace.name}.{dataset.name}.{table.name}"
    ):
        # Debug level: per-table skip messages would be noisy at INFO on
        # large workspaces.
        logger.debug(
            f"Table {table.name} in {dataset.name}, not allowed for profiling"
        )
        return
    logger.debug(f"Profiling table: {table.name}")

    profile = DatasetProfileClass(timestampMillis=builder.get_sys_time())
    profile.rowCount = table.row_count
    profile.columnCount = table.column_count
    profile.fieldProfiles = []

    # Columns and measures are profiled uniformly: both carry the same
    # optional min/max/unique_count/sample_values attributes.
    columns: List[
        Union[powerbi_data_classes.Column, powerbi_data_classes.Measure]
    ] = [*(table.columns or []), *(table.measures or [])]
    for column in columns:
        allowed_column = self.__config.profile_pattern.allowed(
            f"{workspace.name}.{dataset.name}.{table.name}.{column.name}"
        )
        if column.isHidden or not allowed_column:
            logger.debug(f"Column {column.name} not allowed for profiling")
            continue
        field_profile = DatasetFieldProfileClass(column.name or "")
        field_profile.sampleValues = column.sample_values
        field_profile.min = column.min
        field_profile.max = column.max
        field_profile.uniqueCount = column.unique_count
        profile.fieldProfiles.append(field_profile)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=ds_urn,
        aspectName="datasetProfile",
        aspect=profile,
    )
    dataset_mcps.append(mcp)

@staticmethod
def transform_tags(tags: List[str]) -> GlobalTagsClass:
return GlobalTagsClass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ class Column:
columnType: Optional[str] = None
expression: Optional[str] = None
description: Optional[str] = None
# Profiling results (populated only when profiling is enabled; otherwise None).
# min/max are carried as strings, matching DatasetFieldProfileClass fields.
min: Optional[str] = None
max: Optional[str] = None
unique_count: Optional[int] = None
sample_values: Optional[List[str]] = None


@dataclass
Expand All @@ -108,6 +112,10 @@ class Measure:
BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, StringTypeClass
] = dataclasses.field(default_factory=NullTypeClass)
description: Optional[str] = None
# Profiling results (populated only when profiling is enabled); same
# shape as the profiling fields on Column so both can be profiled uniformly.
min: Optional[str] = None
max: Optional[str] = None
unique_count: Optional[int] = None
sample_values: Optional[List[str]] = None


@dataclass
Expand All @@ -117,6 +125,8 @@ class Table:
expression: Optional[str] = None
columns: Optional[List[Column]] = None
measures: Optional[List[Measure]] = None
# Profiling results (populated only when profiling is enabled; otherwise None).
row_count: Optional[int] = None
column_count: Optional[int] = None

# Pointer to the parent dataset.
dataset: Optional["PowerBIDataset"] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
AdminAPIResolver,
RegularAPIResolver,
)
from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_profiler import (
PowerBiDatasetProfilingResolver,
)

# Logger instance
logger = logging.getLogger(__name__)
Expand All @@ -47,6 +50,13 @@ def __init__(self, config: PowerBiDashboardSourceConfig) -> None:
tenant_id=self.__config.tenant_id,
)

self.__profiling_resolver = PowerBiDatasetProfilingResolver(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are only two way to get the metadata from PowerBI either using regular API or Admin API. You can clearly see this segregation in PowerBI Rest API documentation: https://learn.microsoft.com/en-us/rest/api/power-bi/.

As per doc it looks like it comes under regular-api, so please add profile_dataset in base resolver and provide implementation in both regular and admin resolver (here it is just pass). Invoke the powerbi_profiler.py from regular api's profile_dataset method

client_id=self.__config.client_id,
client_secret=self.__config.client_secret,
tenant_id=self.__config.tenant_id,
config=self.__config,
)

def log_http_error(self, message: str) -> Any:
logger.warning(message)
_, e, _ = sys.exc_info()
Expand Down Expand Up @@ -286,11 +296,12 @@ def _parse_endorsement(endorsements: Optional[dict]) -> List[str]:

return [endorsement]

def _get_workspace_datasets(self, scan_result: Optional[dict]) -> dict:
def _get_workspace_datasets(self, workspace: Workspace) -> dict:
"""
Filter out "dataset" from scan_result and return Dataset instance set
"""
dataset_map: dict = {}
scan_result = workspace.scan_result

if scan_result is None:
return dataset_map
Expand Down Expand Up @@ -344,30 +355,33 @@ def _get_workspace_datasets(self, scan_result: Optional[dict]) -> dict:
and len(table[Constant.SOURCE]) > 0
else None
)
dataset_instance.tables.append(
Table(
name=table[Constant.NAME],
full_name="{}.{}".format(
dataset_name.replace(" ", "_"),
table[Constant.NAME].replace(" ", "_"),
),
expression=expression,
columns=[
Column(
**column,
datahubDataType=FIELD_TYPE_MAPPING.get(
column["dataType"], FIELD_TYPE_MAPPING["Null"]
),
)
for column in table.get("columns", [])
],
measures=[
Measure(**measure) for measure in table.get("measures", [])
],
dataset=dataset_instance,
)
table = Table(
name=table[Constant.NAME],
full_name="{}.{}".format(
dataset_name.replace(" ", "_"),
table[Constant.NAME].replace(" ", "_"),
),
expression=expression,
columns=[
Column(
**column,
datahubDataType=FIELD_TYPE_MAPPING.get(
column["dataType"], FIELD_TYPE_MAPPING["Null"]
),
)
for column in table.get("columns", [])
],
measures=[
Measure(**measure) for measure in table.get("measures", [])
],
dataset=dataset_instance,
row_count=None,
column_count=None,
)

self.__profiling_resolver.profile_dataset(
dataset_instance, table, workspace.name
)
dataset_instance.tables.append(table)
return dataset_map

def _fill_metadata_from_scan_result(
Expand All @@ -392,9 +406,7 @@ def _fill_metadata_from_scan_result(
independent_datasets=[],
)
cur_workspace.scan_result = workspace_metadata
cur_workspace.datasets = self._get_workspace_datasets(
cur_workspace.scan_result
)
cur_workspace.datasets = self._get_workspace_datasets(cur_workspace)

# Fetch endorsements tag if it is enabled from configuration
if self.__config.extract_endorsements_to_tags:
Expand Down
Loading
Loading