feat(platform-instance): a simple client-only change to support platf… #3996

Merged
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -112,6 +112,7 @@ module.exports = {
"metadata-ingestion/adding-source",
"docs/how/add-custom-ingestion-source",
"docs/how/add-custom-data-platform",
"docs/platform-instances",
"docs/how/add-user-data",
],
},
56 changes: 56 additions & 0 deletions docs/cli.md
@@ -179,3 +179,59 @@ datahub --debug put --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDa
curl -X POST -H 'User-Agent: python-requests/2.26.0' -H 'Accept-Encoding: gzip, deflate' -H 'Accept: */*' -H 'Connection: keep-alive' -H 'X-RestLi-Protocol-Version: 2.0.0' -H 'Content-Type: application/json' --data '{"proposal": {"entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", "aspectName": "ownership", "changeType": "UPSERT", "aspect": {"contentType": "application/json", "value": "{\"owners\": [{\"owner\": \"urn:li:corpuser:jdoe\", \"type\": \"DEVELOPER\"}, {\"owner\": \"urn:li:corpuser:jdub\", \"type\": \"DATAOWNER\"}]}"}}}' 'http://localhost:8080/aspects/?action=ingestProposal'
Update succeeded with status 200
```

### migrate

The `migrate` group of commands allows you to perform certain kinds of migrations.

#### dataplatform2instance

The `dataplatform2instance` migration command migrates your entities from an instance-agnostic platform identifier to an instance-specific one. If you have ingested metadata for this platform in the past and would like to carry important metadata over to the new instance-specific entities (for example, documentation, tags, or terms that your users have added to datasets), you should run this command. For further context, read the [Platform Instance Guide](./platform-instances.md).

A few important options worth calling out:
- `--dry-run` / `-n`: Get a report of what will be migrated before running.
- `--force` / `-F`: Skip the confirmation prompt before the migration starts; use this only if you know what you are doing.
- `--keep`: Preserve the old entities instead of deleting them. The default behavior is to soft-delete old entities.
- `--hard`: Hard-delete the old entities.

**_Note_**: Timeseries aspects such as Usage Statistics and Dataset Profiles are not migrated over to the new entity instances; new data points will be created when you re-run ingestion using the `usage` sources or sources with profiling turned on.

##### Dry Run
```console
datahub migrate dataplatform2instance --platform elasticsearch --instance prod_index --dry-run
Starting migration: platform:elasticsearch, instance=prod_index, force=False, dry-run=True
100% (25 of 25) |####################################################################################################################################################################################| Elapsed Time: 0:00:00 Time: 0:00:00
[Dry Run] Migration Report:
--------------
[Dry Run] Migration Run Id: migrate-5710349c-1ec7-4b83-a7d3-47d71b7e972e
[Dry Run] Num entities created = 25
[Dry Run] Num entities affected = 0
[Dry Run] Num entities migrated = 25
[Dry Run] Details:
[Dry Run] New Entities Created: {'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datahubretentionindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.schemafieldindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.system_metadata_service_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.tagindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataset_datasetprofileaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.mlmodelindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.mlfeaturetableindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datajob_datahubingestioncheckpointaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datahub_usage_event,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataset_operationaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datajobindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataprocessindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.glossarytermindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataplatformindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.mlmodeldeploymentindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datajob_datahubingestionrunsummaryaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.graph_service_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.datahubpolicyindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataset_datasetusagestatisticsaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dashboardindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.glossarynodeindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.mlfeatureindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.dataflowindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.mlprimarykeyindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,prod_index.chartindex_v2,PROD)'}
[Dry Run] External Entities Affected: None
[Dry Run] Old Entities Migrated = {'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataset_datasetusagestatisticsaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,mlmodelindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,mlmodeldeploymentindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datajob_datahubingestionrunsummaryaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datahubretentionindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datahubpolicyindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataset_datasetprofileaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,glossarynodeindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataset_operationaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,graph_service_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datajobindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,mlprimarykeyindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dashboardindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datajob_datahubingestioncheckpointaspect_v1,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,tagindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,datahub_usage_event,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,schemafieldindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,mlfeatureindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataprocessindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataplatformindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,mlfeaturetableindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,glossarytermindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,dataflowindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,chartindex_v2,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:elasticsearch,system_metadata_service_v1,PROD)'}
```

##### Real Migration (with soft-delete)
```console
datahub migrate dataplatform2instance --platform hive --instance warehouse
Starting migration: platform:hive, instance=warehouse, force=False, dry-run=False
Will migrate 4 urns such as ['urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)']
New urns will look like ['urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.logging_events,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.fct_users_created,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.SampleHiveDataset,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.fct_users_deleted,PROD)']

Ok to proceed? [y/N]:
...
Migration Report:
--------------
Migration Run Id: migrate-f5ae7201-4548-4bee-aed4-35758bb78c89
Num entities created = 4
Num entities affected = 0
Num entities migrated = 4
Details:
New Entities Created: {'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.SampleHiveDataset,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.fct_users_deleted,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.logging_events,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.fct_users_created,PROD)'}
External Entities Affected: None
Old Entities Migrated = {'urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)'}
```

Binary file added docs/imgs/platform-instances-for-ingestion.png
44 changes: 44 additions & 0 deletions docs/platform-instances.md
@@ -0,0 +1,44 @@
# Working With Platform Instances

DataHub's metadata model for Datasets currently supports a three-part key:
- Data Platform (e.g. `urn:li:dataPlatform:mysql`)
- Name (e.g. `db.schema.name`)
- Env or Fabric (e.g. `DEV`, `PROD`, etc.)

Together, these form dataset urns like `urn:li:dataset:(urn:li:dataPlatform:mysql,db.schema.name,PROD)`.

Unfortunately, this naming scheme does not make it easy to represent multiple deployments of the same platform (or technology) within the same environment or fabric. For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository.

As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions:
- A `dataPlatformInstance` aspect that has been added to Datasets, which allows a dataset to be associated with an instance of a platform.
- Enhancements to all ingestion sources that allow a platform instance to be attached to a recipe, changing the generated urns from the `urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,ENV)` format to the `urn:li:dataset:(urn:li:dataPlatform:<platform>,<instance>.<name>,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset, etc.) also have source-specific configuration additions that let the recipe author map a platform to the instance name it should be associated with; see the sketch below.
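
As an illustrative sketch only (option names vary by source, so check the source-specific docs; the `platform_instance_map` key and the `warehouse` instance name here are assumptions for the example), a lineage-producing source might map upstream platforms to instance names like this:

```yaml
source:
  type: superset
  config:
    connect_uri: http://localhost:8088
    # Upstream hive tables referenced in lineage would be emitted in the
    # instance-qualified form, e.g.
    # urn:li:dataset:(urn:li:dataPlatform:hive,warehouse.<name>,PROD)
    platform_instance_map:
      hive: warehouse

sink:
  # sink configs
```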

![Platform instances for ingestion](./imgs/platform-instances-for-ingestion.png)

## Naming Platform Instances

When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. For example, `core_warehouse` or `finance_redshift` are allowed names, as are pure GUIDs like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in every recipe that refers to the same instance, to ensure that the identifiers produced by different sources line up; see the sketch below.
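
For example, here is a minimal sketch of two recipes (separate YAML documents) that must agree on the instance name; the `core_warehouse` name, the host, and the `platform_instance_map` key for the lineage-producing source are assumptions for the example:

```yaml
# Recipe 1: ingests the MySQL warehouse itself
source:
  type: mysql
  config:
    host_port: warehouse-host:3306
    platform_instance: core_warehouse
---
# Recipe 2: a source that emits lineage to the same warehouse must use
# the identical instance name so the generated urns line up
source:
  type: superset
  config:
    connect_uri: http://localhost:8088
    platform_instance_map:
      mysql: core_warehouse
```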

## Enabling Platform Instances

Read the ingestion source-specific guides for how to enable platform instances in each of them.
The general pattern is to add an optional configuration parameter called `platform_instance`.

For example, here is how you would configure a recipe to ingest a MySQL instance that you want to call `core_finance`:
```yaml
source:
  type: mysql
  config:
    # Coordinates
    host_port: localhost:3306
    platform_instance: core_finance
    database: dbname

    # Credentials
    username: root
    password: example

sink:
  # sink configs
```


7 changes: 6 additions & 1 deletion metadata-ingestion/source_docs/athena.md
@@ -14,6 +14,10 @@ This plugin extracts the following:
- Column types associated with each table
- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md)

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | ✔️ | [link](../../docs/platform-instances.md) |
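
As a minimal sketch of enabling a platform instance for Athena (the region, staging directory, workgroup, and `my_athena_instance` values are placeholders, not verbatim from the PR):

```yaml
source:
  type: athena
  config:
    aws_region: us-east-1
    s3_staging_dir: "s3://my-staging-bucket/prefix/"
    work_group: primary
    # Qualifies the generated dataset urns with this instance name
    platform_instance: my_athena_instance
```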

## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
@@ -54,7 +58,8 @@ As a SQL-based service, the Athena integration is also supported by our SQL prof
| `aws_region` | ✅ | | AWS region code. |
| `s3_staging_dir` | ✅ | | Of format `"s3://<bucket-name>/prefix/"`. The `s3_staging_dir` parameter is needed because Athena always writes query results to S3. <br />See https://docs.aws.amazon.com/athena/latest/ug/querying.html. |
| `work_group` | ✅ | | Name of Athena workgroup. <br />See https://docs.aws.amazon.com/athena/latest/ug/manage-queries-control-costs-with-workgroups.html. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `platform_instance` | | None | The Platform instance to use while constructing URNs. |
| `options.<option>` | | | Any options specified here will be passed to SQLAlchemy's `create_engine` as kwargs.<br />See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. |
| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. |
| `table_pattern.deny` | | | List of regex patterns for tables to exclude from ingestion. |
4 changes: 4 additions & 0 deletions metadata-ingestion/source_docs/bigquery.md
@@ -73,6 +73,10 @@ This plugin extracts the following:
- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md)
- Table level lineage.

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | 🛑 | BigQuery doesn't need platform instances because project IDs in BigQuery are globally unique. [link](../../docs/platform-instances.md) |

:::tip

You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source described below.
5 changes: 5 additions & 0 deletions metadata-ingestion/source_docs/data_lake.md
@@ -51,6 +51,11 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on

:::

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) |


## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
6 changes: 6 additions & 0 deletions metadata-ingestion/source_docs/druid.md
@@ -8,6 +8,11 @@ To install this plugin, run `pip install 'acryl-datahub[druid]'`.

## Capabilities

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | ✔️ | [link](../../docs/platform-instances.md) |


This plugin extracts the following:

- Metadata for databases, schemas, and tables
@@ -51,6 +56,7 @@ As a SQL-based service, the Druid integration is also supported by our SQL prof
| `database` | | | Database to ingest. |
| `database_alias` | | | Alias to apply to database when ingesting. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `platform_instance` | | None | The Platform instance to use while constructing URNs. |
| `options.<option>` | | | Any options specified here will be passed to SQLAlchemy's `create_engine` as kwargs.<br />See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. |
| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. |
| `table_pattern.deny` | | | List of regex patterns for tables to exclude from ingestion. |
6 changes: 6 additions & 0 deletions metadata-ingestion/source_docs/elastic_search.md
@@ -13,6 +13,11 @@ This plugin extracts the following:
- Metadata for indexes
- Column types associated with each index field

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | ✔️ | [link](../../docs/platform-instances.md) |


## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
@@ -49,6 +54,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `username` | | "" | The username credential. |
| `password` | | "" | The password credential. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `platform_instance` | | None | The Platform instance to use while constructing URNs. |
| `index_pattern.allow` | | | List of regex patterns for indexes to include in ingestion. |
| `index_pattern.deny` | | | List of regex patterns for indexes to exclude from ingestion. |
| `index_pattern.ignoreCase` | | `True` | Whether regex matching should ignore case or not. |
4 changes: 4 additions & 0 deletions metadata-ingestion/source_docs/glue.md
@@ -17,6 +17,10 @@ This plugin extracts the following:
- Table metadata, such as owner, description and parameters
- Jobs and their component transformations, data sources, and data sinks

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) |

## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.