From ad3b8f9b09431f70a5ff895c32cac00c430573ad Mon Sep 17 00:00:00 2001
From: Hyejin Yoon <0327jane@gmail.com>
Date: Wed, 22 May 2024 15:25:49 +0900
Subject: [PATCH] docs: improve lineage docs (#10396)
---
docs/api/tutorials/lineage.md | 166 ++++++++++++------
.../library/read_lineage_datajob_rest.py | 13 ++
.../library/read_lineage_dataset_rest.py | 13 ++
.../library/read_lineage_execute_graphql.py | 44 +++++
.../examples/library/read_lineage_rest.py | 43 -----
5 files changed, 187 insertions(+), 92 deletions(-)
create mode 100644 metadata-ingestion/examples/library/read_lineage_datajob_rest.py
create mode 100644 metadata-ingestion/examples/library/read_lineage_dataset_rest.py
create mode 100644 metadata-ingestion/examples/library/read_lineage_execute_graphql.py
delete mode 100644 metadata-ingestion/examples/library/read_lineage_rest.py
diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md
index c30307098d613a..daa8f548b2360f 100644
--- a/docs/api/tutorials/lineage.md
+++ b/docs/api/tutorials/lineage.md
@@ -15,6 +15,7 @@ This guide will show you how to
- Add lineage between datasets.
- Add column-level lineage between datasets.
+- Read lineage.
## Prerequisites
@@ -109,7 +110,7 @@ Expected Response:
-### Expected Outcomes of Adding Lineage
+### Expected Outcome
You can now see the lineage between `fct_users_deleted` and `logging_events`.
@@ -117,6 +118,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
+
## Add Column-level Lineage
@@ -129,7 +131,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
-### Expected Outcome of Adding Column Level Lineage
+### Expected Outcome
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
@@ -137,18 +139,30 @@ You can now see the column-level lineage between datasets. Note that you have to
-## Read Table Lineage
+## Add Lineage to Non-Dataset Entities
+
+You can also add lineage to non-dataset entities, such as DataJobs, Charts, and Dashboards.
+Please refer to the following examples.
+
+| Connection | Examples | A.K.A |
+|---------------------|-------------------|-----------------|
+| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py) | |
+| DataJob to Dataset  | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py) | Pipeline Lineage |
+| Chart to Dashboard | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) | |
+| Chart to Dataset | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) | |
+
+
+## Read Lineage (Lineage Impact Analysis)
```graphql
-query searchAcrossLineage {
- searchAcrossLineage(
+query scrollAcrossLineage {
+ scrollAcrossLineage(
input: {
query: "*"
- urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
- start: 0
+ urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
count: 10
direction: DOWNSTREAM
orFilters: [
@@ -175,8 +189,13 @@ query searchAcrossLineage {
}
}
```
+:::info Degree
+Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on.
+:::
+
+The GraphQL example shows using lineage degrees as a filter, but additional search filters can be included here as well.
+This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` query, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage).
-This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
@@ -184,7 +203,7 @@ This example shows using lineage degrees as a filter, but additional search filt
```shell
curl --location --request POST 'http://localhost:8080/api/graphql' \
--header 'Authorization: Bearer ' \
---header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
+--header 'Content-Type: application/json' --data-raw '{ { "query": "query scrollAcrossLineage { scrollAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)\" count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
}}'
```
@@ -192,67 +211,116 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
```python
-{{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }}
+{{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }}
```
+The Python SDK example shows how to read the lineage of a dataset. Please note that the `aspect_type` parameter can vary depending on the entity type.
+Below are a few examples of `aspect_type` for different entities.
+
+|Entity|Aspect_type| Reference |
+|-------|------------|--------------------------------------------------------------------------|
+|Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage) |
+|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/dataJob.md#datajobinputoutput) |
+|Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo) |
+|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataFlow.md#dataflowinfo) |
+
+Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section.
-This will perform a multi-hop lineage search on the urn specified. For more information about the `searchAcrossLineage` mutation, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
-## Read Column Lineage
+### Expected Outcome
-
-
+As an outcome, you should see the downstream entities of `logging_events`.
```graphql
-query searchAcrossLineage {
- searchAcrossLineage(
- input: {
- query: "*"
- urn: "urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)"
- start: 0
- count: 10
- direction: DOWNSTREAM
- orFilters: [
+{
+ "data": {
+ "scrollAcrossLineage": {
+ "searchResults": [
{
- and: [
- {
- condition: EQUAL
- negated: false
- field: "degree"
- values: ["1", "2", "3+"]
- }
- ]
+ "degree": 1,
+ "entity": {
+ "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)",
+ "type": "DATA_JOB"
+ }
+ },
+ ...
+ {
+ "degree": 2,
+ "entity": {
+ "urn": "urn:li:mlPrimaryKey:(user_analytics,user_name)",
+ "type": "MLPRIMARY_KEY"
+ }
}
]
}
- ) {
- searchResults {
- degree
- entity {
- urn
- type
- }
- }
- }
+ },
+ "extensions": {}
}
```
-This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
+## Read Column-level Lineage
-
-
+You can also read column-level lineage via the Python SDK.
+
+
+
+
+
+```python
+{{ inline /metadata-ingestion/examples/library/read_lineage_dataset_rest.py show_path_as_comment }}
-```shell
-curl --location --request POST 'http://localhost:8080/api/graphql' \
---header 'Authorization: Bearer ' \
---header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
-}}'
```
-This will perform a multi-hop lineage search on the urn specified. You can see schemaField URNs are made up of two parts: first the table they are a column of, and second the path of the column. For more information about the `searchAcrossLineage` mutation, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
+### Expected Outcome
+
+As a response, you will get the full lineage information like this.
+
+```json
+{
+ "UpstreamLineageClass": {
+ "upstreams": [
+ {
+ "UpstreamClass": {
+ "auditStamp": {
+ "AuditStampClass": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown",
+ "impersonator": null,
+ "message": null
+ }
+ },
+ "created": null,
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)",
+ "type": "TRANSFORMED",
+ "properties": null,
+ "query": null
+ }
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "FineGrainedLineageClass": {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD),browser_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),user_id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD),browser)"
+ ],
+ "transformOperation": null,
+ "confidenceScore": 1.0,
+ "query": null
+ }
+ }
+ ]
+ }
+}
+```
diff --git a/metadata-ingestion/examples/library/read_lineage_datajob_rest.py b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py
new file mode 100644
index 00000000000000..e23c1ee3106416
--- /dev/null
+++ b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py
@@ -0,0 +1,13 @@
+from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+
+# Imports for metadata model classes
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+
+# Get the current lineage (inputs and outputs) for a datajob
+gms_endpoint = "http://localhost:8080"
+graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
+# DataJobInputOutput is a DataJob aspect, so the urn must identify a DataJob (not a dataset)
+urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)"
+result = graph.get_aspect(entity_urn=urn, aspect_type=DataJobInputOutputClass)
+
+print(result)
diff --git a/metadata-ingestion/examples/library/read_lineage_dataset_rest.py b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py
new file mode 100644
index 00000000000000..5e3e4b643e4fe7
--- /dev/null
+++ b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py
@@ -0,0 +1,13 @@
+from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+
+# Metadata model class of the aspect being read
+from datahub.metadata.schema_classes import UpstreamLineageClass
+
+# Connect to the DataHub server
+client_config = DatahubClientConfig(server="http://localhost:8080")
+graph = DataHubGraph(client_config)
+
+# Read the upstream lineage aspect currently stored for the dataset
+dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
+lineage = graph.get_aspect(entity_urn=dataset_urn, aspect_type=UpstreamLineageClass)
+print(lineage)
diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py
new file mode 100644
index 00000000000000..7b7f8ef43f4f5e
--- /dev/null
+++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py
@@ -0,0 +1,44 @@
+# Running GraphQL queries requires access to the DataHubGraph (RestEmitter is not enough)
+from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+
+gms_endpoint = "http://localhost:8080"
+graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
+
+# Search the lineage of an entity; each result reports its degree, urn and type
+query = """
+query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
+  scrollAcrossLineage(input: $input) {
+    searchResults {
+      degree
+      entity {
+        urn
+        type
+      }
+    }
+  }
+}
+"""
+
+variables = {
+    "input": {
+        "query": "*",
+        "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)",
+        "count": 10,
+        "direction": "DOWNSTREAM",
+        "orFilters": [
+            {
+                "and": [
+                    {
+                        "condition": "EQUAL",
+                        "negated": False,  # a real boolean, not the string "false"
+                        "field": "degree",
+                        "values": ["1", "2", "3+"],
+                    }
+                ]
+            }
+        ],
+    }
+}
+result = graph.execute_graphql(query=query, variables=variables)
+
+print(result)
diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_rest.py
deleted file mode 100644
index bd9b4e8651dba9..00000000000000
--- a/metadata-ingestion/examples/library/read_lineage_rest.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
-from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
-
-gms_endpoint = "http://localhost:8080"
-graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
-
-# Query multiple aspects from entity
-query = """
-query searchAcrossLineage {
- searchAcrossLineage(
- input: {
- query: "*"
- urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
- start: 0
- count: 10
- direction: DOWNSTREAM
- orFilters: [
- {
- and: [
- {
- condition: EQUAL
- negated: false
- field: "degree"
- values: ["1", "2", "3+"]
- }
- ] # Additional search filters can be included here as well
- }
- ]
- }
- ) {
- searchResults {
- degree
- entity {
- urn
- type
- }
- }
- }
-}
-"""
-result = graph.execute_graphql(query=query)
-
-print(result)