From d796fcc51383a7b3241f045c01359a68ffd7dea3 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Tue, 12 Mar 2024 15:18:18 +0900 Subject: [PATCH 1/5] feat: init draft for obs doc --- docs-website/sidebars.js | 49 +++++++----- docs/managed-datahub/observe/assertions.md | 59 +++++++++++++++ docs/managed-datahub/observe/data-contract.md | 75 +++++++++++++++++++ 3 files changed, 166 insertions(+), 17 deletions(-) create mode 100644 docs/managed-datahub/observe/assertions.md create mode 100644 docs/managed-datahub/observe/data-contract.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 13bda5d735f3e7..fb9c24196b6c80 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -69,26 +69,41 @@ module.exports = { type: "category", items: [ { - type: "doc", - id: "docs/managed-datahub/observe/freshness-assertions", - className: "saasOnly", - }, - { - type: "doc", - id: "docs/managed-datahub/observe/volume-assertions", - className: "saasOnly", - }, - { - type: "doc", - id: "docs/managed-datahub/observe/custom-sql-assertions", - className: "saasOnly", + label: "Assertions", + type: "category", + link: { + type: "doc", + id: "docs/managed-datahub/observe/assertions", + }, + items: [ + { + type: "doc", + id: "docs/managed-datahub/observe/freshness-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/volume-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/custom-sql-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/column-assertions", + className: "saasOnly", + }, + ], }, { - type: "doc", - id: "docs/managed-datahub/observe/column-assertions", - className: "saasOnly", + type: "doc", + id: "docs/managed-datahub/observe/data-contract", + className: "saasOnly", }, - ], + ], }, { Guides: ["docs/features/feature-guides/ui-lineage"], diff --git a/docs/managed-datahub/observe/assertions.md 
b/docs/managed-datahub/observe/assertions.md new file mode 100644 index 00000000000000..54b4b3c9b56d91 --- /dev/null +++ b/docs/managed-datahub/observe/assertions.md @@ -0,0 +1,59 @@ +# Assertions + +_Note: currently we support Snowflake, Databricks, Redshift, and BigQuery for out-of-the-box contract monitoring as part of Acryl Observe._ + + +## What is an Assertion + +An assertion is a data quality test that finds data that violate one or more specified rules. These serve as the backbone of Data Contracts – this is how we verify the contract is met.  + + +## How to Create and Run Assertions + +Data quality tests (a.k.a. assertions) can be run by either Acryl or ingested from a 3rd party tool.  + + +### 3rd Party Runners + +You can integrate 3rd party tools as follows: + +- [DBT Test](https://datahubproject.io/docs/generated/ingestion/sources/dbt#integrating-with-dbt-test) + +- [Great Expectations](https://datahubproject.io/docs/metadata-ingestion/integration_docs/great-expectations/) + +**** + +If you opt for a 3rd party tool, it will be your responsibility to ensure the assertions are run based on the Data Contract spec stored in DataHub. With 3rd party runners, you can get the Assertion Change events by subscribing to our Kafka topic using the [DataHub Actions Framework](https://datahubproject.io/docs/actions).  + + +### Acryl Observe + +For Acryl-provided assertion runners, we can deploy an agent in your environment to hit your sources and DataHub. 
Acryl Observe offers out-of-the-box evaluation of the following kinds of assertions: + +- [Freshness](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions) (SLAs) + +- [Volume](https://datahubproject.io/docs/managed-datahub/observe/volume-assertions) + +- [Custom SQL](https://datahubproject.io/docs/managed-datahub/observe/custom-sql-assertions) + +- [Column](https://datahubproject.io/docs/managed-datahub/observe/column-assertions) + +**** + +These can be defined through the DataHub API or the UI. With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](https://datahubproject.io/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge/) (the availability and simplicity of setup of each solution dependent on your current Acryl setup – chat with your Acryl representative to learn more). + +**** + +Assertions UI example + +![](https://lh7-us.googleusercontent.com/Cbo_rujT4QBYGzqMhxk7wNsaiWw6_04biMtC4-qPg-WJdP1VvwKOBcwpQg4j34WfWOvHuCmldP7-GUh-v9Y1YVGYyr1A4qzyolqAT7rC7pU1_0RhrtHDRvWiUZIXh9tB92_4rYkSDNCK6eykMb4Vels) + + +## Alerts + +Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [slack messages](https://datahubproject.io/docs/managed-datahub/saas-slack-setup/) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. + + +## Cost + +We provide a plethora of ways to run your assertions, aiming to allow you to use the cheapest possible means to do so and/or the most accurate means to do so, depending on your use case. 
For example, for Freshness (SLA) assertions, it is relatively cheap to use either their Audit Log or Information Schema as a means to run freshness checks, and we support both of those as well as Last Modified Column, High Watermark Column, and DataHub Operation ([see the docs for more details](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions/#3-change-source)). diff --git a/docs/managed-datahub/observe/data-contract.md b/docs/managed-datahub/observe/data-contract.md new file mode 100644 index 00000000000000..4b670e6af31b77 --- /dev/null +++ b/docs/managed-datahub/observe/data-contract.md @@ -0,0 +1,75 @@ +# Data Contracts + +## A Data Contract is… + +- Verifiable : based on the actual physical data asset, not its metadata (eg. schema checks, column-level data checks, and operational SLA-s but not documentation, ownership, and tags). +- A set of assertions : The actual checks against the physical asset to determine a contract’s status (schema, freshness, volume, custom, and column) +- Producer oriented : One contract per physical data asset, owned by the producer.\[collapse section]> Consumer oriented data contractsWe’ve gone with producer-oriented contracts to keep the number of contracts manageable and because we expect consumers to desire a lot of overlap in a given physical asset’s contract. Although, we've heard feedback that consumer-oriented data contracts meet certain needs that producer-oriented contracts do not. 
For example, having one contract per consumer all on the same physical data asset would allow each consumer to get alerts only when the assertions they care about are violated.We welcome feedback on this in slack!Validated Data Contract example![](https://lh7-us.googleusercontent.com/m2MfgNq5E9t51NjtI-rquwfaRWCeNJgVbcbD6XrU0aC-nwx-gLUo0Td680oq5c5IkCB-se44qReRWeHryaKbYxq7k-fGhJMZMIzRYCsU1gQkpew-zWRx-r7kxN7VoLzXJ0H8_svLp6VTKhUOKHTPkD8)** + +## Data Contract and Assertions + +Another way to word our vision of data contracts is: _A bundle of verifiable assertions on physical data assets representing a public producer commitment._These can be all of the assertions on an asset or only the subset you want publicly promised to consumers. Data Contracts allow you to “promote” a selected group of your assertions as a public promise: if this subset of assertions is not met, the Data Contract is failing.A note on ownership - the owner of the physical data asset is also the owner of the contract and can accept proposed changes and make changes themselves to the contract. + +## How to Create Data Contracts + +Data Contracts can be created via DataHub CLI (YAML), API, or UI. + +### DataHub CLI using YAML + +For creation via CLI, it’s a simple CLI upsert command that you can integrate into your CI/CD system to publish your Data Contracts and any change to them. +1) Define your data contract. 
+```yaml +# id: sample_data_contract # Optional: if not provided, an id will be generated +entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) +version: 1 +freshness: + type: cron + cron: "4 8 * * 1-5" +schema: + type: json-schema + json-schema: + type: object + properties: + field_foo: + type: string + native_type: VARCHAR(100) + field_bar: + type: boolean + native_type: boolean + field_documents: + type: array + items: + type: object + properties: + docId: + type: object + properties: + docPolicy: + type: object + properties: + policyId: + type: integer + fileId: + type: integer + required: + - field_bar + - field_documents + + +``` + +### UI + +1. Navigate to the Dataset Profile for the dataset you wish to create a contract for + +2. Under the **Validations** > **Data Contracts** tab, click **Create**.![](https://lh7-us.googleusercontent.com/aOIfU9hAnJA4j_ii1F_qKBezbAxUJil8Y8mq7cr3Le0l5MjNPkt6VXkhJSEtVRU83zBa8fR9lmhPoilhCDjU7x7OZ7vEpP4BUyS3OUWz2M9HG9cv1ROzAjhktbltvz5gLaISvcf2q0DKWsiOnjcEukM) + +3. Select the assertions you wish to be included in the Data Contract. (_Note that when creating a Data Contract via the UI, the Freshness, Schema, and Data Quality assertions are expected to have been created already)_![](https://lh7-us.googleusercontent.com/UpHZ20vpUJAPgS6Jos1zviSu1nLkZl7a1s40Zd3HE3GeN2lNbom37sJCK4K_q4O-BUmwdUk2sS35PRu_ZnmKgjwDlUTdIh3aXFIXv1mydZm9BRVOeMwnfsFWDfFOovQzizgv6M3ZMa-IfjEyS8sGdXc) + +4. See it in the UI![](https://lh7-us.googleusercontent.com/hoDoFEuHmTC8M1LhgbwvE6FWn9F0s3N-JZFlmDouALVlsOjHBcwLnCAPHD2gtLAb5n83-FRA0oqw1FIoWyLmOx7RyejukGJPqdqHwmzUPhnisYC8NXqdeVyP8SqdwtUYTBDMsw2C0N_PO-w908geWZQ)** + +### API + +Coming soon. 
+ +## How to Run Data Contracts From 49fd0503dc66434ae011af53cc928fcd6f169280 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Tue, 12 Mar 2024 15:33:04 +0900 Subject: [PATCH 2/5] format: apply yarnlint --- docs-website/sidebars.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index fb9c24196b6c80..86b93de8877960 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -96,14 +96,14 @@ module.exports = { id: "docs/managed-datahub/observe/column-assertions", className: "saasOnly", }, - ], + ], }, { - type: "doc", - id: "docs/managed-datahub/observe/data-contract", - className: "saasOnly", + type: "doc", + id: "docs/managed-datahub/observe/data-contract", + className: "saasOnly", }, - ], + ], }, { Guides: ["docs/features/feature-guides/ui-lineage"], From 1f31bfd3e8f1f4d22bbf51f0e623e5a6582c5b2e Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Thu, 14 Mar 2024 07:07:57 +0900 Subject: [PATCH 3/5] fix: formatting issue --- docs/managed-datahub/observe/assertions.md | 6 ------ docs/managed-datahub/observe/data-contract.md | 3 --- 2 files changed, 9 deletions(-) diff --git a/docs/managed-datahub/observe/assertions.md b/docs/managed-datahub/observe/assertions.md index 54b4b3c9b56d91..18690153a59f4a 100644 --- a/docs/managed-datahub/observe/assertions.md +++ b/docs/managed-datahub/observe/assertions.md @@ -7,18 +7,15 @@ _Note: currently we support Snowflake, Databricks, Redshift, and BigQuery for ou An assertion is a data quality test that finds data that violate one or more specified rules. These serve as the backbone of Data Contracts – this is how we verify the contract is met.  - ## How to Create and Run Assertions Data quality tests (a.k.a. assertions) can be run by either Acryl or ingested from a 3rd party tool.  
- ### 3rd Party Runners You can integrate 3rd party tools as follows: - [DBT Test](https://datahubproject.io/docs/generated/ingestion/sources/dbt#integrating-with-dbt-test) - - [Great Expectations](https://datahubproject.io/docs/metadata-ingestion/integration_docs/great-expectations/) **** @@ -31,11 +28,8 @@ If you opt for a 3rd party tool, it will be your responsibility to ensure the as For Acryl-provided assertion runners, we can deploy an agent in your environment to hit your sources and DataHub. Acryl Observe offers out-of-the-box evaluation of the following kinds of assertions: - [Freshness](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions) (SLAs) - - [Volume](https://datahubproject.io/docs/managed-datahub/observe/volume-assertions) - - [Custom SQL](https://datahubproject.io/docs/managed-datahub/observe/custom-sql-assertions) - - [Column](https://datahubproject.io/docs/managed-datahub/observe/column-assertions) **** diff --git a/docs/managed-datahub/observe/data-contract.md b/docs/managed-datahub/observe/data-contract.md index 4b670e6af31b77..f139ee40a3cfb6 100644 --- a/docs/managed-datahub/observe/data-contract.md +++ b/docs/managed-datahub/observe/data-contract.md @@ -61,11 +61,8 @@ schema: ### UI 1. Navigate to the Dataset Profile for the dataset you wish to create a contract for - 2. Under the **Validations** > **Data Contracts** tab, click **Create**.![](https://lh7-us.googleusercontent.com/aOIfU9hAnJA4j_ii1F_qKBezbAxUJil8Y8mq7cr3Le0l5MjNPkt6VXkhJSEtVRU83zBa8fR9lmhPoilhCDjU7x7OZ7vEpP4BUyS3OUWz2M9HG9cv1ROzAjhktbltvz5gLaISvcf2q0DKWsiOnjcEukM) - 3. Select the assertions you wish to be included in the Data Contract. 
(_Note that when creating a Data Contract via the UI, the Freshness, Schema, and Data Quality assertions are expected to have been created already)_![](https://lh7-us.googleusercontent.com/UpHZ20vpUJAPgS6Jos1zviSu1nLkZl7a1s40Zd3HE3GeN2lNbom37sJCK4K_q4O-BUmwdUk2sS35PRu_ZnmKgjwDlUTdIh3aXFIXv1mydZm9BRVOeMwnfsFWDfFOovQzizgv6M3ZMa-IfjEyS8sGdXc) - 4. See it in the UI![](https://lh7-us.googleusercontent.com/hoDoFEuHmTC8M1LhgbwvE6FWn9F0s3N-JZFlmDouALVlsOjHBcwLnCAPHD2gtLAb5n83-FRA0oqw1FIoWyLmOx7RyejukGJPqdqHwmzUPhnisYC8NXqdeVyP8SqdwtUYTBDMsw2C0N_PO-w908geWZQ)** ### API From 20e44f1b015c0f820f0281f1b9f746c9c42bcf5b Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 14 Mar 2024 16:04:52 +0900 Subject: [PATCH 4/5] fix: reflect updates --- docs-website/sidebars.js | 1 - docs/managed-datahub/observe/assertions.md | 36 ++-- docs/managed-datahub/observe/data-contract.md | 158 +++++++++++++----- 3 files changed, 137 insertions(+), 58 deletions(-) diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 86b93de8877960..d1a8d6ec08581f 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -101,7 +101,6 @@ module.exports = { { type: "doc", id: "docs/managed-datahub/observe/data-contract", - className: "saasOnly", }, ], }, diff --git a/docs/managed-datahub/observe/assertions.md b/docs/managed-datahub/observe/assertions.md index 18690153a59f4a..64af7140cc8593 100644 --- a/docs/managed-datahub/observe/assertions.md +++ b/docs/managed-datahub/observe/assertions.md @@ -1,53 +1,53 @@ # Assertions -_Note: currently we support Snowflake, Databricks, Redshift, and BigQuery for out-of-the-box contract monitoring as part of Acryl Observe._ +:::note Contract Monitoring Support +Currently we support Snowflake, Databricks, Redshift, and BigQuery for out-of-the-box contract monitoring as part of Acryl Observe. +::: ## What is an Assertion -An assertion is a data quality test that finds data that violate one or more specified rules. 
These serve as the backbone of Data Contracts – this is how we verify the contract is met.  +An assertion is **a data quality test that finds data that violate one or more specified rules.** These serve as the backbone of [Data Contracts](/docs/managed-datahub/observe/data-contract.md) – this is how we verify the contract is met. ## How to Create and Run Assertions -Data quality tests (a.k.a. assertions) can be run by either Acryl or ingested from a 3rd party tool.  +Data quality tests (a.k.a. assertions) can be ingested from a 3rd party tool or run by Acryl. ### 3rd Party Runners You can integrate 3rd party tools as follows: -- [DBT Test](https://datahubproject.io/docs/generated/ingestion/sources/dbt#integrating-with-dbt-test) -- [Great Expectations](https://datahubproject.io/docs/metadata-ingestion/integration_docs/great-expectations/) +- [DBT Test](/docs/generated/ingestion/sources/dbt.md#integrating-with-dbt-test) +- [Great Expectations](../../../metadata-ingestion/integration_docs/great-expectations.md) -**** -If you opt for a 3rd party tool, it will be your responsibility to ensure the assertions are run based on the Data Contract spec stored in DataHub. With 3rd party runners, you can get the Assertion Change events by subscribing to our Kafka topic using the [DataHub Actions Framework](https://datahubproject.io/docs/actions).  +If you opt for a 3rd party tool, it will be your responsibility to ensure the assertions are run based on the Data Contract spec stored in DataHub. With 3rd party runners, you can get the Assertion Change events by subscribing to our Kafka topic using the [DataHub Actions Framework](/docs/actions/README.md). ### Acryl Observe For Acryl-provided assertion runners, we can deploy an agent in your environment to hit your sources and DataHub. 
Acryl Observe offers out-of-the-box evaluation of the following kinds of assertions: -- [Freshness](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions) (SLAs) -- [Volume](https://datahubproject.io/docs/managed-datahub/observe/volume-assertions) -- [Custom SQL](https://datahubproject.io/docs/managed-datahub/observe/custom-sql-assertions) -- [Column](https://datahubproject.io/docs/managed-datahub/observe/column-assertions) +- [Freshness](/docs/managed-datahub/observe/freshness-assertions.md) (SLAs) +- [Volume](/docs/managed-datahub/observe/volume-assertions.md) +- [Custom SQL](/docs/managed-datahub/observe/custom-sql-assertions.md) +- [Column](/docs/managed-datahub/observe/column-assertions.md) -**** -These can be defined through the DataHub API or the UI. With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](https://datahubproject.io/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge/) (the availability and simplicity of setup of each solution dependent on your current Acryl setup – chat with your Acryl representative to learn more). +These can be defined through the DataHub API or the UI. With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) (the availability and simplicity of setup of each solution dependent on your current Acryl setup – chat with your Acryl representative to learn more). -**** -Assertions UI example -![](https://lh7-us.googleusercontent.com/Cbo_rujT4QBYGzqMhxk7wNsaiWw6_04biMtC4-qPg-WJdP1VvwKOBcwpQg4j34WfWOvHuCmldP7-GUh-v9Y1YVGYyr1A4qzyolqAT7rC7pU1_0RhrtHDRvWiUZIXh9tB92_4rYkSDNCK6eykMb4Vels) +

+ +

## Alerts -Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [slack messages](https://datahubproject.io/docs/managed-datahub/saas-slack-setup/) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. +Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [slack messages](/docs/managed-datahub/saas-slack-setup.md) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. ## Cost -We provide a plethora of ways to run your assertions, aiming to allow you to use the cheapest possible means to do so and/or the most accurate means to do so, depending on your use case. For example, for Freshness (SLA) assertions, it is relatively cheap to use either their Audit Log or Information Schema as a means to run freshness checks, and we support both of those as well as Last Modified Column, High Watermark Column, and DataHub Operation ([see the docs for more details](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions/#3-change-source)). +We provide a plethora of ways to run your assertions, aiming to allow you to use the cheapest possible means to do so and/or the most accurate means to do so, depending on your use case. 
For example, for Freshness (SLA) assertions, it is relatively cheap to use either their Audit Log or Information Schema as a means to run freshness checks, and we support both of those as well as Last Modified Column, High Watermark Column, and DataHub Operation ([see the docs for more details](/docs/managed-datahub/observe/freshness-assertions.md#3-change-source)). diff --git a/docs/managed-datahub/observe/data-contract.md b/docs/managed-datahub/observe/data-contract.md index f139ee40a3cfb6..3619efb4a9106c 100644 --- a/docs/managed-datahub/observe/data-contract.md +++ b/docs/managed-datahub/observe/data-contract.md @@ -1,14 +1,34 @@ # Data Contracts -## A Data Contract is… +## What Is a Data Contract -- Verifiable : based on the actual physical data asset, not its metadata (eg. schema checks, column-level data checks, and operational SLA-s but not documentation, ownership, and tags). -- A set of assertions : The actual checks against the physical asset to determine a contract’s status (schema, freshness, volume, custom, and column) -- Producer oriented : One contract per physical data asset, owned by the producer.\[collapse section]> Consumer oriented data contractsWe’ve gone with producer-oriented contracts to keep the number of contracts manageable and because we expect consumers to desire a lot of overlap in a given physical asset’s contract. Although, we've heard feedback that consumer-oriented data contracts meet certain needs that producer-oriented contracts do not. 
For example, having one contract per consumer all on the same physical data asset would allow each consumer to get alerts only when the assertions they care about are violated.We welcome feedback on this in slack!Validated Data Contract example![](https://lh7-us.googleusercontent.com/m2MfgNq5E9t51NjtI-rquwfaRWCeNJgVbcbD6XrU0aC-nwx-gLUo0Td680oq5c5IkCB-se44qReRWeHryaKbYxq7k-fGhJMZMIzRYCsU1gQkpew-zWRx-r7kxN7VoLzXJ0H8_svLp6VTKhUOKHTPkD8)** +The definition of a Data Contract is consisted of the following: + +- **Verifiable** : based on the actual physical data asset, not its metadata (e.g., schema checks, column-level data checks, and operational SLA-s but not documentation, ownership, and tags). +- **A set of assertions** : The actual checks against the physical asset to determine a contract’s status (schema, freshness, volume, custom, and column) +- **Producer oriented** : One contract per physical data asset, owned by the producer. + + +
+Consumer Oriented Data contracts +We’ve gone with producer-oriented contracts to keep the number of contracts manageable and because we expect consumers to desire a lot of overlap in a given physical asset’s contract. Although, we've heard feedback that consumer-oriented data contracts meet certain needs that producer-oriented contracts do not. For example, having one contract per consumer all on the same physical data asset would allow each consumer to get alerts only when the assertions they care about are violated.We welcome feedback on this in slack! +
+ +

+ +

## Data Contract and Assertions -Another way to word our vision of data contracts is: _A bundle of verifiable assertions on physical data assets representing a public producer commitment._These can be all of the assertions on an asset or only the subset you want publicly promised to consumers. Data Contracts allow you to “promote” a selected group of your assertions as a public promise: if this subset of assertions is not met, the Data Contract is failing.A note on ownership - the owner of the physical data asset is also the owner of the contract and can accept proposed changes and make changes themselves to the contract. +Another way to word our vision of data contracts is **A bundle of verifiable assertions on physical data assets representing a public producer commitment.** +These can be all the assertions on an asset or only the subset you want publicly promised to consumers. Data Contracts allow you to **promote a selected group of your assertions** as a public promise: if this subset of assertions is not met, the Data Contract is failing. + +See docs on [assertions](/docs/managed-datahub/observe/assertions.md) for more details on the types of assertions and how to create and run them. + +:::note Ownership +the owner of the physical data asset is also the owner of the contract and can accept proposed changes and make changes themselves to the contract. +::: + ## How to Create Data Contracts @@ -17,7 +37,8 @@ Data Contracts can be created via DataHub CLI (YAML), API, or UI. ### DataHub CLI using YAML For creation via CLI, it’s a simple CLI upsert command that you can integrate into your CI/CD system to publish your Data Contracts and any change to them. -1) Define your data contract. + +1. Define your data contract. 
```yaml # id: sample_data_contract # Optional: if not provided, an id will be generated entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) @@ -25,48 +46,107 @@ version: 1 freshness: type: cron cron: "4 8 * * 1-5" -schema: - type: json-schema - json-schema: - type: object - properties: - field_foo: - type: string - native_type: VARCHAR(100) - field_bar: - type: boolean - native_type: boolean - field_documents: - type: array - items: - type: object - properties: - docId: - type: object - properties: - docPolicy: - type: object - properties: - policyId: - type: integer - fileId: - type: integer - required: - - field_bar - - field_documents +data_quality: + - type: unique + column: field_foo +## here's an example of how you'd define the schema +# schema: +# type: json-schema +# json-schema: +# type: object +# properties: +# field_foo: +# type: string +# native_type: VARCHAR(100) +# field_bar: +# type: boolean +# native_type: boolean +# field_documents: +# type: array +# items: +# type: object +# properties: +# docId: +# type: object +# properties: +# docPolicy: +# type: object +# properties: +# policyId: +# type: integer +# fileId: +# type: integer +# required: +# - field_bar +# - field_documents +``` +2. Use the CLI to create the contract by running the below command. +```shell +datahub datacontract upsert -f contract_definition.yml ``` +3. Now you can see your contract on the UI. + +

+ +

+ + ### UI 1. Navigate to the Dataset Profile for the dataset you wish to create a contract for -2. Under the **Validations** > **Data Contracts** tab, click **Create**.![](https://lh7-us.googleusercontent.com/aOIfU9hAnJA4j_ii1F_qKBezbAxUJil8Y8mq7cr3Le0l5MjNPkt6VXkhJSEtVRU83zBa8fR9lmhPoilhCDjU7x7OZ7vEpP4BUyS3OUWz2M9HG9cv1ROzAjhktbltvz5gLaISvcf2q0DKWsiOnjcEukM) -3. Select the assertions you wish to be included in the Data Contract. (_Note that when creating a Data Contract via the UI, the Freshness, Schema, and Data Quality assertions are expected to have been created already)_![](https://lh7-us.googleusercontent.com/UpHZ20vpUJAPgS6Jos1zviSu1nLkZl7a1s40Zd3HE3GeN2lNbom37sJCK4K_q4O-BUmwdUk2sS35PRu_ZnmKgjwDlUTdIh3aXFIXv1mydZm9BRVOeMwnfsFWDfFOovQzizgv6M3ZMa-IfjEyS8sGdXc) -4. See it in the UI![](https://lh7-us.googleusercontent.com/hoDoFEuHmTC8M1LhgbwvE6FWn9F0s3N-JZFlmDouALVlsOjHBcwLnCAPHD2gtLAb5n83-FRA0oqw1FIoWyLmOx7RyejukGJPqdqHwmzUPhnisYC8NXqdeVyP8SqdwtUYTBDMsw2C0N_PO-w908geWZQ)** +2. Under the **Validations** > **Data Contracts** tab, click **Create**. + +

+ +

+ + +3. Select the assertions you wish to be included in the Data Contract. + +

+ +

+ + +:::note Create Data Contracts via UI +Please note that when creating a Data Contract via UI, the Freshness, Schema, and Data Quality assertions are expected to have been created already. +::: +4. Now you can see it in the UI. + +

+ +

+ ### API -Coming soon. +:::note +API guide on creating data contract is coming soon! +::: ## How to Run Data Contracts + +Running Data Contracts is dependent on running the contract’s assertions and getting the results on DataHub. Using Acryl Observe (available on SaaS), you can schedule assertions on DataHub itself. Otherwise, you can run your assertions outside of DataHub and have the results published back to DataHub. + +DataHub integrates nicely with DBT Test and Great Expectations, as described below. For other 3rd party assertion runners, you’ll need to use our APIs to publish the assertion results back to our platform. + +### DBT Test + +During dbt ingestion, we pick up the dbt `run_results` file, which contains the dbt test run results, and translate it into assertion runs. [See details here.](/docs/generated/ingestion/sources/dbt.md#module-dbt) + +

+ +

+ + + +### Great Expectations + +For Great Expectations, you can integrate the **DataHubValidationAction** directly into your Great Expectations Checkpoint in order to have the assertion (a.k.a. expectation) results sent to DataHub. [See the guide here](../../../metadata-ingestion/integration_docs/great-expectations.md). + +

+ +

From 1837653c26439d76adb44ee740df3437d33ac8ba Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Fri, 15 Mar 2024 14:36:32 +0900 Subject: [PATCH 5/5] fix: reflect pr review --- docs/managed-datahub/observe/assertions.md | 35 +++++------ docs/managed-datahub/observe/data-contract.md | 59 ++++--------------- .../examples/library/create_data_contract.yml | 39 ++++++++++++ 3 files changed, 67 insertions(+), 66 deletions(-) create mode 100644 metadata-ingestion/examples/library/create_data_contract.yml diff --git a/docs/managed-datahub/observe/assertions.md b/docs/managed-datahub/observe/assertions.md index 64af7140cc8593..f6d47ebfb30e2d 100644 --- a/docs/managed-datahub/observe/assertions.md +++ b/docs/managed-datahub/observe/assertions.md @@ -4,25 +4,12 @@ Currently we support Snowflake, Databricks, Redshift, and BigQuery for out-of-the-box contract monitoring as part of Acryl Observe. ::: - -## What is an Assertion - -An assertion is **a data quality test that finds data that violate one or more specified rules.** These serve as the backbone of [Data Contracts](/docs/managed-datahub/observe/data-contract.md) – this is how we verify the contract is met. +An assertion is **a data quality test that finds data that violates a specified rule.** +Assertions serve as the building blocks of [Data Contracts](/docs/managed-datahub/observe/data-contract.md) – this is how we verify the contract is met. ## How to Create and Run Assertions -Data quality tests (a.k.a. assertions) can be ingested from a 3rd party tool or run by Acryl. - -### 3rd Party Runners - -You can integrate 3rd party tools as follows: - -- [DBT Test](/docs/generated/ingestion/sources/dbt.md#integrating-with-dbt-test) -- [Great Expectations](../../../metadata-ingestion/integration_docs/great-expectations.md) - - -If you opt for a 3rd party tool, it will be your responsibility to ensure the assertions are run based on the Data Contract spec stored in DataHub. 
With 3rd party runners, you can get the Assertion Change events by subscribing to our Kafka topic using the [DataHub Actions Framework](/docs/actions/README.md). - +Data quality tests (a.k.a. assertions) can be created and run by Acryl or ingested from a 3rd party tool. ### Acryl Observe @@ -33,20 +20,28 @@ For Acryl-provided assertion runners, we can deploy an agent in your environment - [Custom SQL](/docs/managed-datahub/observe/custom-sql-assertions.md) - [Column](/docs/managed-datahub/observe/column-assertions.md) - -These can be defined through the DataHub API or the UI. With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) (the availability and simplicity of setup of each solution dependent on your current Acryl setup – chat with your Acryl representative to learn more). - - +These can be defined through the DataHub API or the UI.

+### 3rd Party Runners + +You can integrate 3rd party tools as follows: + +- [DBT Test](/docs/generated/ingestion/sources/dbt.md#integrating-with-dbt-test) +- [Great Expectations](../../../metadata-ingestion/integration_docs/great-expectations.md) + +If you opt for a 3rd party tool, it will be your responsibility to ensure the assertions are run based on the Data Contract spec stored in DataHub. With 3rd party runners, you can get the Assertion Change events by subscribing to our Kafka topic using the [DataHub Actions Framework](/docs/actions/README.md). + ## Alerts Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [slack messages](/docs/managed-datahub/saas-slack-setup.md) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. +With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) (the availability and simplicity of setup of each solution depend on your current Acryl setup – chat with your Acryl representative to learn more). + ## Cost diff --git a/docs/managed-datahub/observe/data-contract.md b/docs/managed-datahub/observe/data-contract.md index 3619efb4a9106c..31bb414b4bf69f 100644 --- a/docs/managed-datahub/observe/data-contract.md +++ b/docs/managed-datahub/observe/data-contract.md @@ -2,7 +2,10 @@ ## What Is a Data Contract -The definition of a Data Contract is consisted of the following: +A Data Contract is **an agreement between a data asset's producer and consumer**, serving as a promise about the quality of the data. +It often includes [assertions](assertions.md) about the data’s schema, freshness, and data quality.
+ +Some of the key characteristics of a Data Contract are: - **Verifiable** : based on the actual physical data asset, not its metadata (e.g., schema checks, column-level data checks, and operational SLA-s but not documentation, ownership, and tags). - **A set of assertions** : The actual checks against the physical asset to determine a contract’s status (schema, freshness, volume, custom, and column) @@ -14,6 +17,8 @@ The definition of a Data Contract is consisted of the following: We’ve gone with producer-oriented contracts to keep the number of contracts manageable and because we expect consumers to desire a lot of overlap in a given physical asset’s contract. Although, we've heard feedback that consumer-oriented data contracts meet certain needs that producer-oriented contracts do not. For example, having one contract per consumer all on the same physical data asset would allow each consumer to get alerts only when the assertions they care about are violated.We welcome feedback on this in slack! +Below is a screenshot of the Data Contracts UI in DataHub. +

@@ -26,7 +31,7 @@ These can be all the assertions on an asset or only the subset you want publicly See docs on [assertions](/docs/managed-datahub/observe/assertions.md) for more details on the types of assertions and how to create and run them. :::note Ownership -the owner of the physical data asset is also the owner of the contract and can accept proposed changes and make changes themselves to the contract. +The owner of the physical data asset is also the owner of the contract and can accept proposed changes and make changes themselves to the contract. ::: @@ -39,48 +44,11 @@ Data Contracts can be created via DataHub CLI (YAML), API, or UI. For creation via CLI, it’s a simple CLI upsert command that you can integrate into your CI/CD system to publish your Data Contracts and any change to them. 1. Define your data contract. -```yaml -# id: sample_data_contract # Optional: if not provided, an id will be generated -entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) -version: 1 -freshness: - type: cron - cron: "4 8 * * 1-5" -data_quality: - - type: unique - column: field_foo -## here's an example of how you'd define the schema -# schema: -# type: json-schema -# json-schema: -# type: object -# properties: -# field_foo: -# type: string -# native_type: VARCHAR(100) -# field_bar: -# type: boolean -# native_type: boolean -# field_documents: -# type: array -# items: -# type: object -# properties: -# docId: -# type: object -# properties: -# docPolicy: -# type: object -# properties: -# policyId: -# type: integer -# fileId: -# type: integer -# required: -# - field_bar -# - field_documents +```yaml +{{ inline /metadata-ingestion/examples/library/create_data_contract.yml show_path_as_comment }} ``` + 2. Use the CLI to create the contract by running the below command. 
```shell @@ -112,7 +80,7 @@ datahub datacontract upsert -f contract_definition.yml :::note Create Data Contracts via UI -Please note that when creating a Data Contract via UI, the Freshness, Schema, and Data Quality assertions are expected to have been created already. +When creating a Data Contract via the UI, the Freshness, Schema, and Data Quality assertions must be created first. ::: 4. Now you can see it in the UI. @@ -123,9 +91,8 @@ Please note that when creating a Data Contract via UI, the Freshness, Schema, an ### API -:::note -API guide on creating data contract is coming soon! -::: +_An API guide on creating Data Contracts is coming soon!_ + ## How to Run Data Contracts diff --git a/metadata-ingestion/examples/library/create_data_contract.yml b/metadata-ingestion/examples/library/create_data_contract.yml new file mode 100644 index 00000000000000..774b3ffd7ebb28 --- /dev/null +++ b/metadata-ingestion/examples/library/create_data_contract.yml @@ -0,0 +1,39 @@ +# id: sample_data_contract # Optional: if not provided, an id will be generated +entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) +version: 1 +freshness: + type: cron + cron: "4 8 * * 1-5" +data_quality: + - type: unique + column: field_foo +## here's an example of how you'd define the schema +# schema: +# type: json-schema +# json-schema: +# type: object +# properties: +# field_foo: +# type: string +# native_type: VARCHAR(100) +# field_bar: +# type: boolean +# native_type: boolean +# field_documents: +# type: array +# items: +# type: object +# properties: +# docId: +# type: object +# properties: +# docPolicy: +# type: object +# properties: +# policyId: +# type: integer +# fileId: +# type: integer +# required: +# - field_bar +# - field_documents