From ad3b8f9b09431f70a5ff895c32cac00c430573ad Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Wed, 22 May 2024 15:25:49 +0900 Subject: [PATCH] docs: improve lineage docs (#10396) --- docs/api/tutorials/lineage.md | 166 ++++++++++++------ .../library/read_lineage_datajob_rest.py | 13 ++ .../library/read_lineage_dataset_rest.py | 13 ++ .../library/read_lineage_execute_graphql.py | 44 +++++ .../examples/library/read_lineage_rest.py | 43 ----- 5 files changed, 187 insertions(+), 92 deletions(-) create mode 100644 metadata-ingestion/examples/library/read_lineage_datajob_rest.py create mode 100644 metadata-ingestion/examples/library/read_lineage_dataset_rest.py create mode 100644 metadata-ingestion/examples/library/read_lineage_execute_graphql.py delete mode 100644 metadata-ingestion/examples/library/read_lineage_rest.py diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index c30307098d613a..daa8f548b2360f 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -15,6 +15,7 @@ This guide will show you how to - Add lineage between datasets. - Add column-level lineage between datasets. +- Read lineage. ## Prerequisites @@ -109,7 +110,7 @@ Expected Response: -### Expected Outcomes of Adding Lineage +### Expected Outcome You can now see the lineage between `fct_users_deleted` and `logging_events`. @@ -117,6 +118,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.

+ ## Add Column-level Lineage @@ -129,7 +131,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`. -### Expected Outcome of Adding Column Level Lineage +### Expected Outcome You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage. @@ -137,18 +139,30 @@ You can now see the column-level lineage between datasets. Note that you have to

-## Read Table Lineage +## Add Lineage to Non-Dataset Entities + +You can also add lineage to non-dataset entities, such as DataJobs, Charts, and Dashboards. +Please refer to the following examples. + +| Connection | Examples | A.K.A | +|---------------------|-------------------|-----------------| +| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py) | | +| DataJob to Dataset | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py)
| Pipeline Lineage | +| Chart to Dashboard | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) | | +| Chart to Dataset | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) | | + + +## Read Lineage (Lineage Impact Analysis) ```graphql -query searchAcrossLineage { - searchAcrossLineage( +query scrollAcrossLineage { + scrollAcrossLineage( input: { query: "*" - urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)" - start: 0 + urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" count: 10 direction: DOWNSTREAM orFilters: [ @@ -175,8 +189,13 @@ query searchAcrossLineage { } } ``` +:::info Degree +Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on. +::: + +The GraphQL example shows using lineage degrees as a filter, but additional search filters can be included here as well. +This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` query, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage). -This example shows using lineage degrees as a filter, but additional search filters can be included here as well. 
@@ -184,7 +203,7 @@ This example shows using lineage degrees as a filter, but additional search filt ```shell curl --location --request POST 'http://localhost:8080/api/graphql' \ --header 'Authorization: Bearer ' \ ---header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" +--header 'Content-Type: application/json' --data-raw '{ { "query": "query scrollAcrossLineage { scrollAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)\" count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" }}' ``` @@ -192,67 +211,116 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ ```python -{{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }} ``` +The Python SDK example shows how to read lineage of a dataset. Please note that the `aspect_type` parameter can vary depending on the entity type. +Below are a few examples of `aspect_type` for different entities. 
+ +|Entity|Aspect_type| Reference | +|-------|------------|--------------------------------------------------------------------------| +|Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage) | +|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/dataJob.md#datajobinputoutput) | +|Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo) | +|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataFlow.md#dataflowinfo) | + +Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section. -This will perform a multi-hop lineage search on the urn specified. For more information about the `searchAcrossLineage` mutation, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage). -## Read Column Lineage +### Expected Outcome - - +As an outcome, you should see the downstream entities of `logging_events`. ```graphql -query searchAcrossLineage { - searchAcrossLineage( - input: { - query: "*" - urn: "urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)" - start: 0 - count: 10 - direction: DOWNSTREAM - orFilters: [ +{ + "data": { + "scrollAcrossLineage": { + "searchResults": [ { - and: [ - { - condition: EQUAL - negated: false - field: "degree" - values: ["1", "2", "3+"] - } - ] + "degree": 1, + "entity": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", + "type": "DATA_JOB" + } + }, + ... + { + "degree": 2, + "entity": { + "urn": "urn:li:mlPrimaryKey:(user_analytics,user_name)", + "type": "MLPRIMARY_KEY" + } } ] } - ) { - searchResults { - degree - entity { - urn - type - } - } - } + }, + "extensions": {} } ``` -This example shows using lineage degrees as a filter, but additional search filters can be included here as well. 
+## Read Column-level Lineage - - +You can also read column-level lineage via Python SDK. + + + + + +```python +{{ inline /metadata-ingestion/examples/library/read_lineage_dataset_rest.py show_path_as_comment }} -```shell -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer ' \ ---header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" -}}' ``` -This will perform a multi-hop lineage search on the urn specified. You can see schemaField URNs are made up of two parts: first the table they are a column of, and second the path of the column. For more information about the `searchAcrossLineage` mutation, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage). +### Expected Outcome + +As a response, you will get the full lineage information like this. 
+ +```graphql +{ + "UpstreamLineageClass": { + "upstreams": [ + { + "UpstreamClass": { + "auditStamp": { + "AuditStampClass": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null, + "message": null + } + }, + "created": null, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)", + "type": "TRANSFORMED", + "properties": null, + "query": null + } + } + ], + "fineGrainedLineages": [ + { + "FineGrainedLineageClass": { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD),browser_id)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),user_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD),browser)" + ], + "transformOperation": null, + "confidenceScore": 1.0, + "query": null + } + } + ] + } +} +``` diff --git a/metadata-ingestion/examples/library/read_lineage_datajob_rest.py b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py new file mode 100644 index 00000000000000..e23c1ee3106416 --- /dev/null +++ b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py @@ -0,0 +1,13 @@ +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +# Imports for metadata model classes +from datahub.metadata.schema_classes import DataJobInputOutputClass + +# Get the current lineage for a datajob +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)" +result = graph.get_aspect(entity_urn=urn, aspect_type=DataJobInputOutputClass) + +print(result) diff --git a/metadata-ingestion/examples/library/read_lineage_dataset_rest.py b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py new file mode 100644 index 00000000000000..5e3e4b643e4fe7 --- /dev/null +++ 
b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py @@ -0,0 +1,13 @@ +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +# Imports for metadata model classes +from datahub.metadata.schema_classes import UpstreamLineageClass + +# Get the current lineage for a dataset by reading its UpstreamLineage aspect +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" +result = graph.get_aspect(entity_urn=urn, aspect_type=UpstreamLineageClass) + +print(result) diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py new file mode 100644 index 00000000000000..7b7f8ef43f4f5e --- /dev/null +++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py @@ -0,0 +1,44 @@ +# GraphQL queries require access to the DataHubGraph (RestEmitter is not enough) +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +# Lineage query: returns the degree, urn and type of each matching entity +query = """ +query scrollAcrossLineage($input: ScrollAcrossLineageInput!) 
{ + scrollAcrossLineage(input: $input) { + searchResults { + degree + entity { + urn + type + } + } + } +} +""" + +variables = { + "input": { + "query": "*", + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "count": 10, + "direction": "DOWNSTREAM", + "orFilters": [ + { + "and": [ + { + "condition": "EQUAL", + "negated": False, + "field": "degree", + "values": ["1", "2", "3+"], + } + ] + } + ], + } +} +result = graph.execute_graphql(query=query, variables=variables) + +print(result) diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_rest.py deleted file mode 100644 index bd9b4e8651dba9..00000000000000 --- a/metadata-ingestion/examples/library/read_lineage_rest.py +++ /dev/null @@ -1,43 +0,0 @@ -# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough) -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph - -gms_endpoint = "http://localhost:8080" -graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) - -# Query multiple aspects from entity -query = """ -query searchAcrossLineage { - searchAcrossLineage( - input: { - query: "*" - urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)" - start: 0 - count: 10 - direction: DOWNSTREAM - orFilters: [ - { - and: [ - { - condition: EQUAL - negated: false - field: "degree" - values: ["1", "2", "3+"] - } - ] # Additional search filters can be included here as well - } - ] - } - ) { - searchResults { - degree - entity { - urn - type - } - } - } -} -""" -result = graph.execute_graphql(query=query) - -print(result)