From 7591c8994a387c365221caa5527d6c03d3f53f4e Mon Sep 17 00:00:00 2001 From: John Joyce Date: Mon, 14 Jun 2021 17:15:24 -0700 Subject: [PATCH] feat(datahub cli): DataHub CLI Quickstart (#2689) --- .github/workflows/build-and-test.yml | 15 +- docker/docker-compose.override.yml | 11 + docker/elasticsearch/env/docker.env | 2 +- docker/mysql-setup/env/docker.env | 5 + .../quickstart/docker-compose.quickstart.yml | 208 +++++++++++++ docker/quickstart/generate_and_compare.sh | 20 ++ .../quickstart/generate_docker_quickstart.py | 113 +++++++ .../quickstart/generate_docker_quickstart.sh | 13 + docker/quickstart/requirements.txt | 3 + docs/quickstart.md | 72 ++++- metadata-ingestion/setup.cfg | 2 +- .../src/datahub/{check => cli}/__init__.py | 0 .../src/datahub/{check => cli}/check_cli.py | 30 +- metadata-ingestion/src/datahub/cli/docker.py | 290 ++++++++++++++++++ .../{check/docker.py => cli/docker_check.py} | 28 +- .../src/datahub/{check => cli}/json_file.py | 0 metadata-ingestion/src/datahub/entrypoints.py | 21 +- .../src/datahub/ingestion/api/registry.py | 5 +- smoke-test/smoke.sh | 16 +- smoke-test/test_e2e.py | 23 +- 20 files changed, 795 insertions(+), 82 deletions(-) create mode 100644 docker/mysql-setup/env/docker.env create mode 100644 docker/quickstart/docker-compose.quickstart.yml create mode 100755 docker/quickstart/generate_and_compare.sh create mode 100644 docker/quickstart/generate_docker_quickstart.py create mode 100755 docker/quickstart/generate_docker_quickstart.sh create mode 100644 docker/quickstart/requirements.txt rename metadata-ingestion/src/datahub/{check => cli}/__init__.py (100%) rename metadata-ingestion/src/datahub/{check => cli}/check_cli.py (59%) create mode 100644 metadata-ingestion/src/datahub/cli/docker.py rename metadata-ingestion/src/datahub/{check/docker.py => cli/docker_check.py} (87%) rename metadata-ingestion/src/datahub/{check => cli}/json_file.py (100%) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 5eb51ac773b07..9028f3634d769 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -65,10 +65,7 @@ jobs: - name: Gradle build run: ./gradlew build -x check -x docs-website:build - name: Smoke test - run: | - ./docker/dev.sh -d - sleep 30 - ./smoke-test/smoke.sh + run: ./smoke-test/smoke.sh - name: Slack failure notification if: failure() && github.event_name == 'push' uses: kpritam/slack-job-status-action@v1 @@ -76,3 +73,13 @@ jobs: job-status: ${{ job.status }} slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }} channel: github-activities + + quickstart-compose-validation: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: "3.6" + - name: Quickstart Compose Validation + run: ./docker/quickstart/generate_and_compare.sh \ No newline at end of file diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 5a9dece77282c..63180a9131ca1 100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -14,6 +14,17 @@ services: - ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql - mysqldata:/var/lib/mysql + mysql-setup: + build: + context: ../ + dockerfile: docker/mysql-setup/Dockerfile + image: acryldata/datahub-mysql-setup:head + env_file: mysql-setup/env/docker.env + hostname: mysql-setup + container_name: mysql-setup + depends_on: + - mysql + datahub-gms: env_file: datahub-gms/env/docker.env depends_on: diff --git 
a/docker/elasticsearch/env/docker.env b/docker/elasticsearch/env/docker.env index 75b52bd98353f..da1be52a1ed91 100644 --- a/docker/elasticsearch/env/docker.env +++ b/docker/elasticsearch/env/docker.env @@ -1,3 +1,3 @@ discovery.type=single-node xpack.security.enabled=false -ES_JAVA_OPTS=-Xms1g -Xmx1g +ES_JAVA_OPTS=-Xms512m -Xmx512m diff --git a/docker/mysql-setup/env/docker.env b/docker/mysql-setup/env/docker.env new file mode 100644 index 0000000000000..49c6ccbbd2bd8 --- /dev/null +++ b/docker/mysql-setup/env/docker.env @@ -0,0 +1,5 @@ +MYSQL_HOST=mysql +MYSQL_PORT=3306 +MYSQL_USERNAME=datahub +MYSQL_PASSWORD=datahub +DATAHUB_DB_NAME=datahub \ No newline at end of file diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml new file mode 100644 index 0000000000000..6b0886e1bad2e --- /dev/null +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -0,0 +1,208 @@ +networks: + default: + name: datahub_network +services: + broker: + container_name: broker + depends_on: + - zookeeper + environment: + - KAFKA_BROKER_ID=1 + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 + - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 + - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 + hostname: broker + image: confluentinc/cp-kafka:5.4.0 + ports: + - 29092:29092 + - 9092:9092 + datahub-frontend-react: + container_name: datahub-frontend-react + depends_on: + - datahub-gms + environment: + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - DATAHUB_SECRET=YouKnowNothing + - DATAHUB_APP_VERSION=1.0 + - DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - DATAHUB_TRACKING_TOPIC=DataHubUsageEvent_v1 + - ELASTIC_CLIENT_HOST=elasticsearch + - ELASTIC_CLIENT_PORT=9200 + hostname: datahub-frontend-react + image: linkedin/datahub-frontend-react:${DATAHUB_VERSION:-latest} + ports: + - 9002:9002 + datahub-gms: + container_name: datahub-gms + depends_on: + - mysql + environment: + - DATASET_ENABLE_SCSI=false + - EBEAN_DATASOURCE_USERNAME=datahub + - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j + - NEO4J_PASSWORD=datahub + hostname: datahub-gms + image: linkedin/datahub-gms:${DATAHUB_VERSION:-latest} + mem_limit: 850m + ports: + - 8080:8080 + datahub-mae-consumer: + container_name: datahub-mae-consumer + depends_on: + - kafka-setup + - elasticsearch-setup + - neo4j + environment: + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j + - NEO4J_PASSWORD=datahub + - GMS_HOST=datahub-gms + - GMS_PORT=8080 + hostname: datahub-mae-consumer + image: linkedin/datahub-mae-consumer:${DATAHUB_VERSION:-latest} + mem_limit: 256m + ports: + - 9091:9091 + datahub-mce-consumer: + container_name: datahub-mce-consumer + depends_on: + - kafka-setup + - 
datahub-gms + environment: + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - GMS_HOST=datahub-gms + - GMS_PORT=8080 + hostname: datahub-mce-consumer + image: linkedin/datahub-mce-consumer:${DATAHUB_VERSION:-latest} + mem_limit: 384m + ports: + - 9090:9090 + elasticsearch: + container_name: elasticsearch + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + healthcheck: + retries: 4 + start_period: 2m + test: + - CMD-SHELL + - curl -sS --fail 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=0s' + || exit 1 + hostname: elasticsearch + image: elasticsearch:7.9.3 + mem_limit: 1g + ports: + - 9200:9200 + volumes: + - esdata:/usr/share/elasticsearch/data + elasticsearch-setup: + container_name: elasticsearch-setup + depends_on: + - elasticsearch + environment: + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - ELASTICSEARCH_PROTOCOL=http + hostname: elasticsearch-setup + image: linkedin/datahub-elasticsearch-setup:${DATAHUB_VERSION:-latest} + kafka-setup: + container_name: kafka-setup + depends_on: + - broker + - schema-registry + environment: + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + hostname: kafka-setup + image: linkedin/datahub-kafka-setup:${DATAHUB_VERSION:-latest} + mysql: + command: --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci + container_name: mysql + environment: + - MYSQL_DATABASE=datahub + - MYSQL_USER=datahub + - MYSQL_PASSWORD=datahub + - MYSQL_ROOT_PASSWORD=datahub + hostname: mysql + image: mysql:5.7 + ports: + - 3306:3306 + volumes: + - ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql + - mysqldata:/var/lib/mysql + mysql-setup: + container_name: mysql-setup + depends_on: + - mysql + environment: + - MYSQL_HOST=mysql + - MYSQL_PORT=3306 + - MYSQL_USERNAME=datahub + - MYSQL_PASSWORD=datahub + - DATAHUB_DB_NAME=datahub + hostname: mysql-setup + image: acryldata/datahub-mysql-setup:head + neo4j: + container_name: neo4j + environment: + - NEO4J_AUTH=neo4j/datahub + - NEO4J_dbms_default__database=graph.db + - NEO4J_dbms_allow__upgrade=true + hostname: neo4j + image: neo4j:4.0.6 + ports: + - 7474:7474 + - 7687:7687 + volumes: + - neo4jdata:/data + schema-registry: + container_name: schema-registry + depends_on: + - zookeeper + - broker + environment: + - SCHEMA_REGISTRY_HOST_NAME=schemaregistry + - SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181 + hostname: schema-registry + image: confluentinc/cp-schema-registry:5.4.0 + ports: + - 8081:8081 + zookeeper: + container_name: zookeeper + environment: + - ZOOKEEPER_CLIENT_PORT=2181 + - ZOOKEEPER_TICK_TIME=2000 + hostname: zookeeper + image: confluentinc/cp-zookeeper:5.4.0 + ports: + - 2181:2181 + volumes: + - zkdata:/var/opt/zookeeper +version: '2' +volumes: + esdata: null + mysqldata: null + neo4jdata: null + zkdata: null diff --git a/docker/quickstart/generate_and_compare.sh b/docker/quickstart/generate_and_compare.sh new file mode 100755 index 0000000000000..e9081222974f4 --- /dev/null +++ b/docker/quickstart/generate_and_compare.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$DIR" + +set -euxo pipefail + +python3 -m venv venv +source venv/bin/activate + +pip install -r requirements.txt +python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml temp.quickstart.yml + +if cmp docker-compose.quickstart.yml 
temp.quickstart.yml; then + printf 'docker-compose.quickstart.yml is up to date.' + exit 0 +else + printf 'docker-compose.quickstart.yml is out of date.' + exit 1 +fi \ No newline at end of file diff --git a/docker/quickstart/generate_docker_quickstart.py b/docker/quickstart/generate_docker_quickstart.py new file mode 100644 index 0000000000000..717f76b8860c7 --- /dev/null +++ b/docker/quickstart/generate_docker_quickstart.py @@ -0,0 +1,113 @@ +import os +from collections.abc import Mapping + +import click +import yaml +from dotenv import dotenv_values +from yaml import Loader + +# Generates a merged docker-compose file with env variables inlined. +# Usage: python3 docker_compose_cli_gen.py ../docker-compose.yml ../docker-compose.override.yml ../docker-compose-gen.yml + +omitted_services = [ + "kafka-rest-proxy", + "kafka-topics-ui", + "schema-registry-ui", + "kibana", +] +mem_limits = { + "datahub-gms": "850m", + "datahub-mae-consumer": "256m", + "datahub-mce-consumer": "384m", + "elasticsearch": "1g", +} + + +def dict_merge(dct, merge_dct): + for k, v in merge_dct.items(): + if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping): + dict_merge(dct[k], merge_dct[k]) + else: + dct[k] = merge_dct[k] + + +def modify_docker_config(base_path, docker_yaml_config): + # 0. Filter out services to be omitted. + for key in list(docker_yaml_config["services"]): + if key in omitted_services: + del docker_yaml_config["services"][key] + + for name, service in docker_yaml_config["services"].items(): + # 1. Extract the env file pointer + env_file = service.get("env_file") + + if env_file is not None: + # 2. Construct full .env path + env_file_path = os.path.join(base_path, env_file) + + # 3. Resolve the .env values + env_vars = dotenv_values(env_file_path) + + # 4. Add an "environment" block to YAML + service["environment"] = list( + f"{key}={value}" for key, value in env_vars.items() + ) + + # 5. Delete the "env_file" value + del service["env_file"] + + # 6. Delete build instructions + if "build" in service: + del service["build"] + + # 7. Set memory limits + if name in mem_limits: + service["mem_limit"] = mem_limits[name] + + # 8. Set docker compose version to 2. 
+ docker_yaml_config["version"] = "2" + + +@click.command() +@click.argument( + "compose-files", + nargs=-1, + type=click.Path( + exists=True, + dir_okay=False, + ), +) +@click.argument("output-file", type=click.Path()) +def generate(compose_files, output_file) -> None: + + # Resolve .env files to inlined vars + modified_files = [] + for compose_file in compose_files: + with open(compose_file, "r") as orig_conf: + docker_config = yaml.load(orig_conf, Loader=Loader) + + base_path = os.path.dirname(compose_file) + modify_docker_config(base_path, docker_config) + modified_files.append(docker_config) + + # Merge services, networks, and volumes maps + merged_docker_config = modified_files[0] + for modified_file in modified_files: + dict_merge(merged_docker_config, modified_file) + + # Write output file + output_dir = os.path.dirname(output_file) + if len(output_dir) and not os.path.exists(output_dir): + os.makedirs(output_dir) + with open(output_file, "w") as new_conf_file: + yaml.dump( + merged_docker_config, + new_conf_file, + default_flow_style=False, + ) + + print(f"Successfully generated {output_file}.") + + +if __name__ == "__main__": + generate() diff --git a/docker/quickstart/generate_docker_quickstart.sh b/docker/quickstart/generate_docker_quickstart.sh new file mode 100755 index 0000000000000..bdef531f24a6d --- /dev/null +++ b/docker/quickstart/generate_docker_quickstart.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$DIR" + +set -euxo pipefail + +python3 -m venv venv +source venv/bin/activate + +pip install -r requirements.txt +python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml docker-compose.quickstart.yml + diff --git a/docker/quickstart/requirements.txt b/docker/quickstart/requirements.txt new file mode 100644 index 0000000000000..a20e96afc8582 --- /dev/null +++ b/docker/quickstart/requirements.txt @@ -0,0 +1,3 @@ +PyYAML==5.4.1 +python-dotenv==0.17.0 +click diff --git a/docs/quickstart.md b/docs/quickstart.md index baee52c962711..8c6d7f1d007f0 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,19 +1,59 @@ # DataHub Quickstart Guide +## Deploying DataHub + +To deploy a new instance of DataHub, perform the following steps. + 1. Install [docker](https://docs.docker.com/install/) and [docker-compose](https://docs.docker.com/compose/install/) (if using Linux). Make sure to allocate enough hardware resources for Docker engine. Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area. -2. Open Docker either from the command line or the desktop app and ensure it is up and running. -3. Clone this repo and `cd` into the root directory of the cloned repository. -4. Run the following command to download and run all Docker containers locally: - ``` - ./docker/quickstart.sh - ``` - This step takes a while to run the first time, and it may be difficult to tell if DataHub is fully up and running from the combined log. Please use [this guide](debugging.md#how-can-i-confirm-if-all-docker-containers-are-running-as-expected-after-a-quickstart) to verify that each container is running correctly. -5. At this point, you should be able to start DataHub by opening [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as username and any password (no password validation by default). However, you'll notice that no data has been ingested yet. -6. 
To ingest provided [sample data](https://github.com/linkedin/datahub/blob/master/metadata-ingestion/mce-cli/bootstrap_mce.dat) to DataHub, switch to a new terminal window, `cd` into the cloned `datahub` repo, and run the following command: - ``` - ./docker/ingestion/ingestion.sh - ``` - After running this, you should be able to see and search sample datasets in DataHub. -7. That's it! To get some real data into DataHub, take a look at the [ingestion framework](../metadata-ingestion/README.md). - -Please refer to the [debugging guide](debugging.md) if you encounter any issues during the quickstart. + + +2. Launch the Docker Engine from command line or the desktop app. + + +3. Install the DataHub CLI + + a. Ensure you have Python 3.6+ installed & configured. (Check using `python3 --version`) + + b. Run the following commands in your terminal + ``` + python3 -m pip install --upgrade pip wheel setuptools + python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails + python3 -m pip install --upgrade acryl-datahub + datahub version + ``` + If you see "command not found", try running cli commands with the prefix 'python3 -m' instead: `python3 -m datahub version` + + +4. To deploy DataHub, run the following CLI command from your terminal + ``` + datahub docker quickstart + ``` + Upon completion of this step, you should be able to navigate to the DataHub UI at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as username and any password (no password validation by default). + + +5. To ingest the sample metadata, run the following CLI command from your terminal + ``` + datahub docker ingest-sample-data + ``` + +That's it! To start pushing your company's metadata into DataHub, take a look at the [Metadata Ingestion Framework](../metadata-ingestion/README.md). + + +## Resetting DataHub + +To cleanse DataHub of all of it's state (e.g. before ingesting your own), you can use the CLI `nuke` command. + +``` +datahub docker nuke +``` + + +## FAQ + +### Command not found: datahub + +If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an older +version of Python. 
Try prefixing your `datahub` commands with `python3 -m`: +``` +python3 -m datahub docker quickstart +``` \ No newline at end of file diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index d30010733c00a..29fcd053d0b4c 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -45,7 +45,7 @@ testpaths = tests/integration [coverage:report] -fail_under = 75 +fail_under = 70 show_missing = true exclude_lines = pragma: no cover diff --git a/metadata-ingestion/src/datahub/check/__init__.py b/metadata-ingestion/src/datahub/cli/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub/check/__init__.py rename to metadata-ingestion/src/datahub/cli/__init__.py diff --git a/metadata-ingestion/src/datahub/check/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py similarity index 59% rename from metadata-ingestion/src/datahub/check/check_cli.py rename to metadata-ingestion/src/datahub/cli/check_cli.py index d80c3ca0aa6ab..62ac981c102c3 100644 --- a/metadata-ingestion/src/datahub/check/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -1,23 +1,23 @@ -import sys - import click from datahub import __package_name__ -from datahub.check.docker import check_local_docker_containers -from datahub.check.json_file import check_mce_file +from datahub.cli.docker import docker_check_impl +from datahub.cli.json_file import check_mce_file from datahub.ingestion.sink.sink_registry import sink_registry from datahub.ingestion.source.source_registry import source_registry +from datahub.ingestion.transformer.transform_registry import transform_registry @click.group() def check() -> None: + """Helper commands for checking various aspects of DataHub.""" pass @check.command() @click.argument("json-file", type=click.Path(exists=True, dir_okay=False)) def mce_file(json_file: str) -> None: - """Check the schema of a MCE JSON file""" + """Check the schema of a MCE JSON file.""" report = check_mce_file(json_file) click.echo(report) @@ -25,16 +25,9 @@ def mce_file(json_file: str) -> None: @check.command() def local_docker() -> None: - """Check that the local Docker containers are healthy""" - - issues = check_local_docker_containers() - if not issues: - click.secho("✔ No issues detected", fg="green") - else: - click.secho("The following issues were detected:", fg="bright_red") - for issue in issues: - click.echo(f"- {issue}") - sys.exit(1) + """Check that the local Docker containers are healthy. 
(deprecated)""" + click.secho("DeprecationWarning: use `datahub docker check` instead", fg="yellow") + docker_check_impl() @check.command() @@ -44,10 +37,10 @@ def local_docker() -> None: type=bool, is_flag=True, default=False, - help="Include extra information for each plugin", + help="Include extra information for each plugin.", ) def plugins(verbose: bool) -> None: - """Check the enabled ingestion plugins""" + """List the enabled ingestion plugins.""" click.secho("Sources:", bold=True) click.echo(source_registry.summary(verbose=verbose)) @@ -55,6 +48,9 @@ def plugins(verbose: bool) -> None: click.secho("Sinks:", bold=True) click.echo(sink_registry.summary(verbose=verbose)) click.echo() + click.secho("Transformers:", bold=True) + click.echo(transform_registry.summary(verbose=verbose, col_width=30)) + click.echo() if not verbose: click.echo("For details on why a plugin is disabled, rerun with '--verbose'") click.echo( diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py new file mode 100644 index 0000000000000..89cbf7efa45c8 --- /dev/null +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -0,0 +1,290 @@ +import datetime +import itertools +import os +import pathlib +import subprocess +import sys +import tempfile +import time +from typing import List, NoReturn, Optional + +import click +import requests + +from datahub.cli.docker_check import ( + check_local_docker_containers, + get_client_with_error, +) +from datahub.ingestion.run.pipeline import Pipeline + +SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml" +BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json" + +GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master" +GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}" +GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}" + + +@click.group() +def docker() -> None: + """Helper commands for setting up and interacting with a local + DataHub instance using Docker.""" + pass + + +def _print_issue_list_and_exit( + issues: List[str], header: str, footer: Optional[str] = None +) -> NoReturn: + click.secho(header, fg="bright_red") + for issue in issues: + click.echo(f"- {issue}") + if footer: + click.echo() + click.echo(footer) + sys.exit(1) + + +def docker_check_impl() -> None: + issues = check_local_docker_containers() + if not issues: + click.secho("✔ No issues detected", fg="green") + else: + _print_issue_list_and_exit(issues, "The following issues were detected:") + + +@docker.command() +def check() -> None: + """Check that the Docker containers are healthy""" + docker_check_impl() + + +@docker.command() +@click.option( + "--version", + type=str, + default="head", + help="Datahub version to be deployed. 
If not set, deploy latest", +) +@click.option( + "--build-locally", + type=bool, + is_flag=True, + default=False, + help="Attempt to build the containers locally before starting", +) +@click.option( + "--quickstart-compose-file", + type=click.Path(exists=True, dir_okay=False, readable=True), + default=[], + multiple=True, + help="Use a local docker-compose file instead of pulling from GitHub", +) +@click.option( + "--dump-logs-on-failure", + type=bool, + is_flag=True, + default=False, + help="If true, the docker-compose logs will be printed to console if something fails", +) +def quickstart( + version: str, + build_locally: bool, + quickstart_compose_file: List[pathlib.Path], + dump_logs_on_failure: bool, +) -> None: + """Start an instance of DataHub locally using docker-compose. + + This command will automatically download the latest docker-compose configuration + from GitHub, pull the latest images, and bring up the DataHub system. + There are options to override the docker-compose config file, build the containers + locally, and dump logs to the console or to a file if something goes wrong. + """ + + # Run pre-flight checks. + issues = check_local_docker_containers(preflight_only=True) + if issues: + _print_issue_list_and_exit(issues, "Unable to run quickstart:") + + quickstart_compose_file = list( + quickstart_compose_file + ) # convert to list from tuple + if not quickstart_compose_file: + click.echo("Fetching docker-compose file from GitHub") + with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file: + path = pathlib.Path(tmp_file.name) + quickstart_compose_file.append(path) + + # Download the quickstart docker-compose file from GitHub. + quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL) + quickstart_download_response.raise_for_status() + tmp_file.write(quickstart_download_response.content) + + # set version + os.environ["DATAHUB_VERSION"] = version + + base_command: List[str] = [ + "docker-compose", + *itertools.chain.from_iterable( + ("-f", f"{path}") for path in quickstart_compose_file + ), + "-p", + "datahub", + ] + + # Pull and possibly build the latest containers. + subprocess.run( + [ + *base_command, + "pull", + ], + check=True, + ) + if build_locally: + subprocess.run( + [ + *base_command, + "build", + "--pull", + ], + check=True, + env={ + **os.environ, + "DOCKER_BUILDKIT": "1", + }, + ) + + # Start it up! (with retries) + max_wait_time = datetime.timedelta(minutes=6) + start_time = datetime.datetime.now() + sleep_interval = datetime.timedelta(seconds=2) + up_interval = datetime.timedelta(seconds=30) + up_attempts = 0 + while (datetime.datetime.now() - start_time) < max_wait_time: + # Attempt to run docker-compose up every minute. + if (datetime.datetime.now() - start_time) > up_attempts * up_interval: + click.echo() + subprocess.run(base_command + ["up", "-d"]) + up_attempts += 1 + + # Check docker health every few seconds. + issues = check_local_docker_containers() + if not issues: + break + + # Wait until next iteration. + click.echo(".", nl=False) + time.sleep(sleep_interval.total_seconds()) + else: + # Falls through if the while loop doesn't exit via break. 
+ click.echo() + with tempfile.NamedTemporaryFile(suffix=".log", delete=False) as log_file: + ret = subprocess.run( + base_command + ["logs"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + check=True, + ) + log_file.write(ret.stdout) + + if dump_logs_on_failure: + with open(log_file.name, "r") as logs: + click.echo("Dumping docker-compose logs:") + click.echo(logs.read()) + click.echo() + + _print_issue_list_and_exit( + issues, + header="Unable to run quickstart - the following issues were detected:", + footer="If you think something went wrong, please file an issue at https://github.com/linkedin/datahub/issues\n" + "or send a message in our Slack https://slack.datahubproject.io/\n" + f"Be sure to attach the logs from {log_file.name}", + ) + + # Handle success condition. + click.echo() + click.secho("✔ DataHub is now running", fg="green") + click.secho( + "Ingest some demo data using `datahub docker ingest-sample-data`,\n" + "or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.", + fg="green", + ) + + +@docker.command() +@click.option( + "--path", + type=click.Path(exists=True, dir_okay=False), + help=f"The MCE json file to ingest. Defaults to downloading {BOOTSTRAP_MCES_FILE} from GitHub", +) +def ingest_sample_data(path: Optional[str]) -> None: + """Ingest sample data into a running DataHub instance.""" + + if path is None: + click.echo("Downloading sample data...") + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file: + path = str(pathlib.Path(tmp_file.name)) + + # Download the bootstrap MCE file from GitHub. + mce_json_download_response = requests.get(GITHUB_BOOTSTRAP_MCES_URL) + mce_json_download_response.raise_for_status() + tmp_file.write(mce_json_download_response.content) + click.echo(f"Downloaded to {path}") + + # Verify that docker is up. + issues = check_local_docker_containers() + if issues: + _print_issue_list_and_exit( + issues, + header="Docker is not ready:", + footer="Try running `datahub docker quickstart` first", + ) + + # Run ingestion. + click.echo("Starting ingestion...") + pipeline = Pipeline.create( + { + "source": { + "type": "file", + "config": { + "filename": path, + }, + }, + "sink": { + "type": "datahub-rest", + "config": {"server": "http://localhost:8080"}, + }, + } + ) + pipeline.run() + ret = pipeline.pretty_print_summary() + sys.exit(ret) + + +@docker.command() +def nuke() -> None: + """Remove all Docker containers, networks, and volumes associated with DataHub.""" + + with get_client_with_error() as (client, error): + if error: + click.secho( + "Docker doesn't seem to be running. 
Did you start it?", fg="red" + ) + return + + click.echo("Removing containers in the datahub project") + for container in client.containers.list( + filters={"label": "com.docker.compose.project=datahub"} + ): + container.remove(v=True, force=True) + + click.echo("Removing volumes in the datahub project") + for volume in client.volumes.list( + filters={"label": "com.docker.compose.project=datahub"} + ): + volume.remove(force=True) + + click.echo("Removing networks in the datahub project") + for network in client.networks.list( + filters={"label": "com.docker.compose.project=datahub"} + ): + network.remove() diff --git a/metadata-ingestion/src/datahub/check/docker.py b/metadata-ingestion/src/datahub/cli/docker_check.py similarity index 87% rename from metadata-ingestion/src/datahub/check/docker.py rename to metadata-ingestion/src/datahub/cli/docker_check.py index e7ff5b66b56d8..8735d97cee264 100644 --- a/metadata-ingestion/src/datahub/check/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -3,11 +3,6 @@ import docker -ENSURE_EXIT_SUCCESS = [ - "kafka-setup", - "elasticsearch-setup", -] - REQUIRED_CONTAINERS = [ "elasticsearch-setup", "elasticsearch", @@ -28,8 +23,20 @@ # "kafka-rest-proxy", ] +ENSURE_EXIT_SUCCESS = [ + "kafka-setup", + "elasticsearch-setup", + "mysql-setup", +] + +CONTAINERS_TO_CHECK_IF_PRESENT = [ + # We only add this container in some cases, but if it's present, we + # definitely want to check that it exits properly. + "mysql-setup", +] + # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it. -MIN_MEMORY_NEEDED = 6.75 # GB +MIN_MEMORY_NEEDED = 3.8 # GB @contextmanager @@ -49,7 +56,7 @@ def memory_in_gb(mem_bytes: int) -> float: return mem_bytes / (1024 * 1024 * 1000) -def check_local_docker_containers() -> List[str]: +def check_local_docker_containers(preflight_only: bool = False) -> List[str]: issues: List[str] = [] with get_client_with_error() as (client, error): if error: @@ -63,6 +70,9 @@ def check_local_docker_containers() -> List[str]: f"Total Docker memory configured {memory_in_gb(total_mem_configured):.2f}GB is below the minimum threshold {MIN_MEMORY_NEEDED}GB" ) + if preflight_only: + return issues + containers = client.containers.list( all=True, filters={ @@ -81,7 +91,9 @@ def check_local_docker_containers() -> List[str]: # Check that the containers are running and healthy. for container in containers: - if container.name not in REQUIRED_CONTAINERS: + if container.name not in ( + REQUIRED_CONTAINERS + CONTAINERS_TO_CHECK_IF_PRESENT + ): # Ignores things like "datahub-frontend" which are no longer used. # This way, we only check required containers like "datahub-frontend-react" # even if there are some old containers lying around. 
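The docker_check.py hunk above is truncated before the rest of the classification logic. For orientation only, here is a minimal, hypothetical sketch (assuming the `docker` Python SDK) of how a check along these lines can classify containers: long-running services must be `running`, while one-shot setup containers only need to have exited successfully. The helper name `classify_containers` and the abbreviated container lists are illustrative and are not part of this patch.

```
# Hypothetical sketch (not part of this patch) of the classification that
# check_local_docker_containers() implies: required services must be running,
# while setup containers only need to have exited with code 0.
from typing import List

import docker

REQUIRED_CONTAINERS = ["elasticsearch", "mysql", "datahub-gms"]  # abbreviated
ENSURE_EXIT_SUCCESS = ["kafka-setup", "elasticsearch-setup", "mysql-setup"]


def classify_containers() -> List[str]:
    issues: List[str] = []
    client = docker.from_env()
    containers = client.containers.list(
        all=True,
        filters={"label": "com.docker.compose.project=datahub"},
    )

    # Required containers must at least exist.
    existing = {c.name for c in containers}
    for name in REQUIRED_CONTAINERS:
        if name not in existing:
            issues.append(f"{name} container is not present")

    for container in containers:
        if container.name in ENSURE_EXIT_SUCCESS:
            # Setup containers are one-shot jobs; a non-zero exit code means failure.
            exit_code = container.attrs["State"]["ExitCode"]
            if container.status != "exited" or exit_code != 0:
                issues.append(f"{container.name} did not exit cleanly")
        elif container.name in REQUIRED_CONTAINERS:
            # Long-running services should still be up.
            if container.status != "running":
                issues.append(f"{container.name} is not running")

    return issues
```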
diff --git a/metadata-ingestion/src/datahub/check/json_file.py b/metadata-ingestion/src/datahub/cli/json_file.py similarity index 100% rename from metadata-ingestion/src/datahub/check/json_file.py rename to metadata-ingestion/src/datahub/cli/json_file.py diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index ec62cac7f3ab8..3ee9ecc277596 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -7,7 +7,8 @@ from pydantic import ValidationError import datahub as datahub_package -from datahub.check.check_cli import check +from datahub.cli.check_cli import check +from datahub.cli.docker import docker from datahub.configuration.config_loader import load_config_file from datahub.ingestion.run.pipeline import Pipeline @@ -25,7 +26,13 @@ logging.basicConfig(format=BASE_LOGGING_FORMAT) -@click.group() +@click.group( + context_settings=dict( + # Avoid truncation of help text. + # See https://github.com/pallets/click/issues/486. + max_content_width=120, + ) +) @click.option("--debug/--no-debug", default=False) @click.version_option( version=datahub_package.nice_version_name(), @@ -45,7 +52,7 @@ def datahub(debug: bool) -> None: @datahub.command() def version() -> None: - """Print version number and exit""" + """Print version number and exit.""" click.echo(f"DataHub CLI version: {datahub_package.nice_version_name()}") click.echo(f"Python version: {sys.version}") @@ -55,11 +62,11 @@ def version() -> None: "-c", "--config", type=click.Path(exists=True, dir_okay=False), - help="Config file in .toml or .yaml format", + help="Config file in .toml or .yaml format.", required=True, ) def ingest(config: str) -> None: - """Main command for ingesting metadata into DataHub""" + """Ingest metadata into DataHub.""" config_file = pathlib.Path(config) pipeline_config = load_config_file(config_file) @@ -77,12 +84,16 @@ def ingest(config: str) -> None: datahub.add_command(check) +datahub.add_command(docker) def main(**kwargs): # This wrapper prevents click from suppressing errors. try: sys.exit(datahub(standalone_mode=False, **kwargs)) + except click.exceptions.Abort: + # Click already automatically prints an abort message, so we can just exit. + sys.exit(1) except click.ClickException as error: error.show() sys.exit(1) diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index ade29f32a83ed..71248e60162ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -84,10 +84,7 @@ def get(self, key: str) -> Type[T]: # If it's not an exception, then it's a registered type. return tp - def summary(self, verbose=True): - col_width = 15 - verbose_col_width = 20 - + def summary(self, verbose=True, col_width=15, verbose_col_width=20): lines = [] for key in sorted(self._mapping.keys()): line = f"{key}" diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index 88710700a6bff..ca2a2f39f9c89 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -1,26 +1,26 @@ #!/bin/bash +set -euxo pipefail # Runs a basic e2e test. It is not meant to be fully comprehensive, # but rather should catch obvious bugs before they make it into prod. # # Script assumptions: # - The gradle build has already been run. -# - Python 3.6+ is installed. -# - The metadata-ingestion codegen script has been run. -# - A full DataHub setup is running on localhost with standard ports. 
-# The easiest way to do this is by using the quickstart or dev -# quickstart scripts. +# - Python 3.6+ is installed and in the PATH. DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd "$DIR" -set -euxo pipefail - python3 -m venv venv source venv/bin/activate pip install --upgrade pip wheel setuptools pip install -r requirements.txt -(cd ../metadata-ingestion && ./scripts/codegen.sh) +datahub docker quickstart \ + --build-locally \ + --quickstart-compose-file ../docker/docker-compose.yml \ + --quickstart-compose-file ../docker/docker-compose.override.yml \ + --quickstart-compose-file ../docker/docker-compose.dev.yml \ + --dump-logs-on-failure pytest -vv diff --git a/smoke-test/test_e2e.py b/smoke-test/test_e2e.py index d25a3a2e2e67d..5ee68b4b01a58 100644 --- a/smoke-test/test_e2e.py +++ b/smoke-test/test_e2e.py @@ -1,9 +1,9 @@ import time + import pytest import requests - +from datahub.cli.docker import check_local_docker_containers from datahub.ingestion.run.pipeline import Pipeline -from datahub.check.docker import check_local_docker_containers GMS_ENDPOINT = "http://localhost:8080" FRONTEND_ENDPOINT = "http://localhost:9002" @@ -15,26 +15,13 @@ "X-RestLi-Protocol-Version": "2.0.0", } kafka_post_ingestion_wait_sec = 60 -healthcheck_wait_retries = 20 -healthcheck_wait_interval_sec = 15 @pytest.fixture(scope="session") def wait_for_healthchecks(): - tries = 0 - while tries < healthcheck_wait_retries: - if tries > 0: - time.sleep(healthcheck_wait_interval_sec) - tries += 1 - - issues = check_local_docker_containers() - if not issues: - print(f"finished waiting for healthchecks after {tries} tries") - yield - return - - issues_str = '\n'.join(f"- {issue}" for issue in issues) - raise RuntimeError(f"retry limit exceeded while waiting for docker healthchecks\n{issues_str}") + # Simply assert that everything is healthy, but don't wait. + assert not check_local_docker_containers() + yield @pytest.mark.dependency()
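As a supplementary illustration of what the new generate_docker_quickstart.py script in this patch does, the following self-contained sketch shows its two core transformations: resolving an `env_file` pointer into an inline `environment` list, and recursively merging an override compose map into the base map. The `dict_merge` logic mirrors the function added by the patch; the inline dictionaries and the `inline_env_file` helper name are made-up minimal examples, not the real compose files.

```
# Illustrative sketch of the env-file inlining and deep-merge steps used by
# generate_docker_quickstart.py. Inputs here are minimal stand-ins.
from collections.abc import Mapping

from dotenv import dotenv_values


def dict_merge(dct, merge_dct):
    # Recursive merge: nested mappings are merged key-by-key; scalars and lists
    # from the override replace the base values.
    for k in merge_dct:
        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
            dict_merge(dct[k], merge_dct[k])
        else:
            dct[k] = merge_dct[k]


def inline_env_file(service, env_file_path):
    # Replace the env_file pointer with an explicit KEY=VALUE environment list.
    env_vars = dotenv_values(env_file_path)
    service["environment"] = [f"{k}={v}" for k, v in env_vars.items()]
    del service["env_file"]


if __name__ == "__main__":
    base = {"services": {"mysql": {"image": "mysql:5.7", "ports": ["3306:3306"]}}}
    override = {"services": {"mysql": {"env_file": "mysql/env/docker.env"}}}
    dict_merge(base, override)
    # base["services"]["mysql"] now carries image, ports, and env_file together,
    # which is the shape the quickstart compose file is generated from.
    print(base)
```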