diff --git a/docs-website/build.gradle b/docs-website/build.gradle
index b5edf446a6c6e6..5284b7063c2091 100644
--- a/docs-website/build.gradle
+++ b/docs-website/build.gradle
@@ -81,6 +81,19 @@ task yarnLint(type: YarnTask, dependsOn: [yarnInstall]) {
   outputs.cacheIf { true }
 }
 
+task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall]) {
+  inputs.files(projectMdFiles)
+  args = ['run', 'lint-fix']
+  outputs.dir("dist")
+  // tell gradle to apply the build cache
+  outputs.cacheIf { true }
+}
+
+task serve(type: YarnTask, dependsOn: [yarnInstall]) {
+  args = ['run', 'serve']
+}
+
+
 task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate]) {
   inputs.files(projectMdFiles)
   inputs.file("package.json").withPathSensitivity(PathSensitivity.RELATIVE)
diff --git a/docs-website/package.json b/docs-website/package.json
index 0c5cb968adf251..7a9675eb1662f1 100644
--- a/docs-website/package.json
+++ b/docs-website/package.json
@@ -12,7 +12,8 @@
     "clear": "docusaurus clear && rm -rf genDocs/*",
     "generate": "rm -rf genDocs/* && ts-node -O '{ \"lib\": [\"es2020\"], \"target\": \"es6\" }' generateDocsDir.ts && mv -v docs/* genDocs/",
     "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js",
-    "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js"
+    "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js",
+    "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js"
   },
   "dependencies": {
     "@docusaurus/core": "^2.0.0-beta.7",
@@ -45,4 +46,4 @@
     "ts-node": "^9.1.1",
     "typescript": "^4.1.5"
   }
-}
+}
\ No newline at end of file
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index ec66aaa32837ac..069514ee0a61a0 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -62,18 +62,13 @@ module.exports = {
       "docs/saas",
       "releases",
     ],
-    "Getting Started": [
-      "docs/quickstart",
-      "docs/cli",
-      "metadata-ingestion/README",
-      "docs/debugging",
-    ],
+    "Getting Started": ["docs/quickstart", "docs/cli", "docs/debugging"],
     "Metadata Ingestion": [
       // add a custom label since the default is 'Metadata Ingestion'
       // note that we also have to add the path to this file in sidebarsjs_hardcoded_titles in generateDocsDir.ts
       {
         type: "doc",
-        label: "Quickstart",
+        label: "Introduction",
         id: "metadata-ingestion/README",
       },
       {
diff --git a/docs/cli.md b/docs/cli.md
index 398dbfe51a8996..732cf19b221844 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -9,7 +9,7 @@ You can find the release notes in [github releases](https://github.com/linkedin/
 ## Installation
 
 ### Using pip
-We recommend python virtual environments (venv-s) to namespace pip modules. Here's an example setup:
+We recommend python virtual environments (venvs) to namespace pip modules. The folks over at [Acryl Data](https://www.acryl.io/) maintain a PyPI package for DataHub metadata ingestion. Here's an example setup:
 
 ```shell
 python3 -m venv datahub-env             # create the environment
@@ -20,7 +20,7 @@ source datahub-env/bin/activate         # activate the environment
 
 Once inside the virtual environment, install `datahub` using the following commands
 
-```console
+```shell
 # Requires Python 3.6+
 python3 -m pip install --upgrade pip wheel setuptools
 python3 -m pip install --upgrade acryl-datahub
@@ -32,8 +32,93 @@ If you run into an error, try checking the [_common setup issues_](../metadata-i
 
 ### Using docker
 
+[![Docker Hub](https://img.shields.io/docker/pulls/linkedin/datahub-ingestion?style=plastic)](https://hub.docker.com/r/linkedin/datahub-ingestion)
+[![datahub-ingestion docker](https://github.com/linkedin/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/linkedin/datahub/actions/workflows/docker-ingestion.yml)
+
+If you don't want to install locally, you can alternatively run metadata ingestion within a Docker container.
+We have prebuilt images available on [Docker Hub](https://hub.docker.com/r/linkedin/datahub-ingestion). All plugins will be installed and enabled automatically.
+
 You can use the `datahub-ingestion` docker image as explained in [Docker Images](../docker/README.md). In case you are using Kubernetes you can start a pod with the `datahub-ingestion` docker image, log onto a shell on the pod and you should have the access to datahub CLI in your kubernetes cluster.
 
+_Limitation: the `datahub_docker.sh` convenience script shown below assumes that the recipe and any input/output files are accessible in the current working directory or its subdirectories. Files outside the current working directory will not be found, and you'll need to invoke the Docker image directly._
+
+```shell
+# Assumes the DataHub repo is cloned locally.
+./metadata-ingestion/scripts/datahub_docker.sh ingest -c ./examples/recipes/example_to_datahub_rest.yml
+```
+
+### Install from source
+
+If you'd like to install from source, see the [developer guide](../metadata-ingestion/developing.md).
+
+## Installing Plugins
+
+We use a plugin architecture so that you can install only the dependencies you actually need. Click the plugin name to learn more about the specific source recipe and any FAQs!
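+
+For example, if you only need to pull metadata from MySQL, you can install just that source plugin (the names in the tables below are the extras you pass to pip):
+
+```shell
+# Installs the core CLI plus only the MySQL source plugin.
+pip install 'acryl-datahub[mysql]'
+```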
+
+### Sources
+
+| Plugin Name | Install Command | Provides |
+|-----------------------------------------------------------------|------------------------------------------------------------| ----------------------------------- |
+| [file](../metadata-ingestion/source_docs/file.md) | _included by default_ | File source and sink |
+| [athena](../metadata-ingestion/source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
+| [bigquery](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
+| [bigquery-usage](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
+| [datahub-business-glossary](../metadata-ingestion/source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source |
+| [dbt](../metadata-ingestion/source_docs/dbt.md) | _no additional dependencies_ | dbt source |
+| [druid](../metadata-ingestion/source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
+| [feast](../metadata-ingestion/source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source |
+| [glue](../metadata-ingestion/source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
+| [hive](../metadata-ingestion/source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source |
+| [kafka](../metadata-ingestion/source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source |
+| [kafka-connect](../metadata-ingestion/source_docs/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'` | Kafka connect source |
+| [ldap](../metadata-ingestion/source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
+| [looker](../metadata-ingestion/source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source |
+| [lookml](../metadata-ingestion/source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
+| [metabase](../metadata-ingestion/source_docs/metabase.md) | `pip install 'acryl-datahub[metabase]'` | Metabase source |
+| [mode](../metadata-ingestion/source_docs/mode.md) | `pip install 'acryl-datahub[mode]'` | Mode Analytics source |
+| [mongodb](../metadata-ingestion/source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
+| [mssql](../metadata-ingestion/source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
+| [mysql](../metadata-ingestion/source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source |
+| [mariadb](../metadata-ingestion/source_docs/mariadb.md) | `pip install 'acryl-datahub[mariadb]'` | MariaDB source |
+| [openapi](../metadata-ingestion/source_docs/openapi.md) | `pip install 'acryl-datahub[openapi]'` | OpenApi Source |
+| [oracle](../metadata-ingestion/source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source |
+| [postgres](../metadata-ingestion/source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source |
+| [redash](../metadata-ingestion/source_docs/redash.md) | `pip install 'acryl-datahub[redash]'` | Redash source |
+| [redshift](../metadata-ingestion/source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source |
+| [sagemaker](../metadata-ingestion/source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source |
+| [snowflake](../metadata-ingestion/source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
+| [snowflake-usage](../metadata-ingestion/source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source |
+| [sql-profiles](../metadata-ingestion/source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems |
+| [sqlalchemy](../metadata-ingestion/source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
+| [superset](../metadata-ingestion/source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
+| [tableau](../metadata-ingestion/source_docs/tableau.md) | `pip install 'acryl-datahub[tableau]'` | Tableau source |
+| [trino](../metadata-ingestion/source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source |
+| [starburst-trino-usage](../metadata-ingestion/source_docs/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source |
+| [nifi](../metadata-ingestion/source_docs/nifi.md) | `pip install 'acryl-datahub[nifi]'` | Nifi source |
+
+### Sinks
+
+| Plugin Name | Install Command | Provides |
+| --------------------------------------- | -------------------------------------------- | -------------------------- |
+| [file](../metadata-ingestion/sink_docs/file.md) | _included by default_ | File source and sink |
+| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink |
+| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
+| [datahub-kafka](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
+
+These plugins can be mixed and matched as desired. For example:
+
+```shell
+pip install 'acryl-datahub[bigquery,datahub-rest]'
+```
+
+### Check the active plugins
+
+```shell
+datahub check plugins
+```
+
+[extra requirements]: https://www.python-ldap.org/en/python-ldap-3.3.0/installing.html#build-prerequisites
+
 ## User Guide
 
 The `datahub` cli allows you to do many things, such as quickstarting a DataHub docker instance locally, ingesting metadata from your sources, as well as retrieving and modifying metadata.
diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md
index 1adc5033c8ee2f..2a028b32283840 100644
--- a/metadata-ingestion/README.md
+++ b/metadata-ingestion/README.md
@@ -1,4 +1,4 @@
-# Metadata Ingestion
+# Intro to Metadata Ingestion
 
 ![Python version 3.6+](https://img.shields.io/badge/python-3.6%2B-blue)
 
@@ -10,95 +10,50 @@ It can be used through our CLI tool, with an orchestrator like Airflow, or as a
 
 ### Prerequisites
 
-Before running any metadata ingestion job, you should make sure that DataHub backend services are all running. If you are trying this out locally, the easiest way to do that is through [quickstart Docker images](../docker).
+Before running any metadata ingestion job, you should make sure that DataHub backend services are all running. If you are trying this out locally, check out the [CLI guide](../docs/cli.md) to install the CLI and learn about the options it offers. You can refer back to that usage guide as you work through this page.
 
-### Install from PyPI
+## Recipes
 
-The folks over at [Acryl Data](https://www.acryl.io/) maintain a PyPI package for DataHub metadata ingestion.
+A recipe is a configuration file that tells our ingestion scripts where to pull data from (source) and where to put it (sink).
+Here's a simple example that pulls metadata from MSSQL (source) and puts it into DataHub over the REST API (sink).
 
-```shell
-# Requires Python 3.6+
-python3 -m pip install --upgrade pip wheel setuptools
-python3 -m pip install --upgrade acryl-datahub
-datahub version
-# If you see "command not found", try running this instead: python3 -m datahub version
-```
+> Note that one recipe file can have only one source and one sink. If you want to ingest from multiple sources, you will need a separate recipe file for each.
 
-If you run into an error, try checking the [_common setup issues_](./developing.md#Common-setup-issues).
-
-#### Installing Plugins
-
-We use a plugin architecture so that you can install only the dependencies you actually need. Click the plugin name to learn more about the specific source recipe and any FAQs!
-
-Sources:
-
-| Plugin Name | Install Command | Provides |
-|-----------------------------------------------------------------|------------------------------------------------------------| ----------------------------------- |
-| [file](./source_docs/file.md) | _included by default_ | File source and sink |
-| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
-| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
-| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
-| [datahub-business-glossary](./source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source |
-| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source |
-| [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
-| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source |
-| [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
-| [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source |
-| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source |
-| [kafka-connect](./source_docs/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'` | Kafka connect source |
-| [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
-| [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source |
-| [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
-| [metabase](./source_docs/metabase.md) | `pip install 'acryl-datahub[metabase]` | Metabase source |
-| [mode](./source_docs/mode.md) | `pip install 'acryl-datahub[mode]'` | Mode Analytics source |
-| [mongodb](./source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
-| [mssql](./source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
-| [mysql](./source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source |
-| [mariadb](./source_docs/mariadb.md) | `pip install 'acryl-datahub[mariadb]'` | MariaDB source |
-| [openapi](./source_docs/openapi.md) | `pip install 'acryl-datahub[openapi]'` | OpenApi Source |
-| [oracle](./source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source |
-| [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source |
-| [redash](./source_docs/redash.md) | `pip install 'acryl-datahub[redash]'` | Redash source |
-| [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source |
-| [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source |
-| [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
-| [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source |
-| [sql-profiles](./source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems |
-| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
-| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
-| [tableau](./source_docs/tableau.md) | `pip install 'acryl-datahub[tableau]'` | Tableau source |
-| [trino](./source_docs/trino.md) | `pip install 'acryl-datahub[trino]` | Trino source |
-| [starburst-trino-usage](./source_docs/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source |
-| [nifi](./source_docs/nifi.md) | `pip install 'acryl-datahub[nifi]` | Nifi source |
-
-Sinks
-
-| Plugin Name | Install Command | Provides |
-| --------------------------------------- | -------------------------------------------- | -------------------------- |
-| [file](./sink_docs/file.md) | _included by default_ | File source and sink |
-| [console](./sink_docs/console.md) | _included by default_ | Console sink |
-| [datahub-rest](./sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
-| [datahub-kafka](./sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
-
-These plugins can be mixed and matched as desired. For example:
+```yaml
+# A sample recipe that pulls metadata from MSSQL and puts it into DataHub
+# using the Rest API.
+source:
+  type: mssql
+  config:
+    username: sa
+    password: ${MSSQL_PASSWORD}
+    database: DemoData
 
-```shell
-pip install 'acryl-datahub[bigquery,datahub-rest]'
-```
+transformers:
+  - type: "fully-qualified-class-name-of-transformer"
+    config:
+      some_property: "some.value"
 
-You can check the active plugins:
-```shell
-datahub check plugins
+sink:
+  type: "datahub-rest"
+  config:
+    server: "http://localhost:8080"
 ```
 
-[extra requirements]: https://www.python-ldap.org/en/python-ldap-3.3.0/installing.html#build-prerequisites
+A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](#installing-plugins).
+
+### Handling sensitive information in recipes
+
+We automatically expand environment variables in the config (e.g. `${MSSQL_PASSWORD}`),
+similar to variable substitution in GNU bash or in docker-compose files. For details, see
+https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitution. Use this substitution to keep sensitive information out of recipe files: as long as the environment variables reach the ingestion process securely, the secrets themselves never need to be stored in a recipe.
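+
+For example, assuming the recipe above references `${MSSQL_PASSWORD}`, one way (among several) to supply the secret at ingestion time is to export it in the shell that runs the ingestion:
+
+```shell
+# The password lives only in the environment, never in the recipe file.
+export MSSQL_PASSWORD='<your-password>'
+datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
+```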
-#### Basic Usage
+### Basic Usage of the CLI for ingestion
 
 ```shell
 pip install 'acryl-datahub[datahub-rest]'  # install the required plugin
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml
+datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
 ```
 
 The `--dry-run` option of the `ingest` command performs all of the ingestion steps, except writing to the sink. This is useful to ensure that the
@@ -120,65 +75,6 @@ datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --preview
 
 # Preview with dry-run
 datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview
 ```
-
-### Install using Docker
-
-[![Docker Hub](https://img.shields.io/docker/pulls/linkedin/datahub-ingestion?style=plastic)](https://hub.docker.com/r/linkedin/datahub-ingestion)
-[![datahub-ingestion docker](https://github.com/linkedin/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/linkedin/datahub/actions/workflows/docker-ingestion.yml)
-
-If you don't want to install locally, you can alternatively run metadata ingestion within a Docker container.
-We have prebuilt images available on [Docker hub](https://hub.docker.com/r/linkedin/datahub-ingestion). All plugins will be installed and enabled automatically.
-
-_Limitation: the datahub_docker.sh convenience script assumes that the recipe and any input/output files are accessible in the current working directory or its subdirectories. Files outside the current working directory will not be found, and you'll need to invoke the Docker image directly._
-
-```shell
-# Assumes the DataHub repo is cloned locally.
-./metadata-ingestion/scripts/datahub_docker.sh ingest -c ./examples/recipes/example_to_datahub_rest.yml
-```
-
-### Install from source
-
-If you'd like to install from source, see the [developer guide](./developing.md).
-
-## Recipes
-
-A recipe is a configuration file that tells our ingestion scripts where to pull data from (source) and where to put it (sink).
-Here's a simple example that pulls metadata from MSSQL and puts it into datahub.
-
-```yaml
-# A sample recipe that pulls metadata from MSSQL and puts it into DataHub
-# using the Rest API.
-source:
-  type: mssql
-  config:
-    username: sa
-    password: ${MSSQL_PASSWORD}
-    database: DemoData
-
-transformers:
-  - type: "fully-qualified-class-name-of-transformer"
-    config:
-      some_property: "some.value"
-
-sink:
-  type: "datahub-rest"
-  config:
-    server: "http://localhost:8080"
-```
-
-Running a recipe is quite easy.
-
-```shell
-datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
-```
-
-A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](#installing-plugins).
-
-### Handling sensitive information in recipes
-
-We automatically expand environment variables in the config (e.g. `${MSSQL_PASSWORD}`),
-similar to variable substitution in GNU bash or in docker-compose files. For details, see
-https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitution. This environment variable substitution should be used to mask sensitive information in recipe files. As long as you can get env variables securely to the ingestion process there would not be any need to store sensitive information in recipes.
 
 ## Transformations
 
 If you'd like to modify data before it reaches the ingestion sinks – for instance, adding additional owners or tags – you can use a transformer to write your own module and integrate it with DataHub.
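+
+As a quick sketch of what this looks like in practice (assuming the built-in `simple_add_dataset_ownership` transformer is available in your version), the following recipe snippet would attach an owner to every ingested dataset:
+
+```yaml
+transformers:
+  - type: "simple_add_dataset_ownership"
+    config:
+      owner_urns:
+        - "urn:li:corpuser:datahub"
+```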