From 9e960f8f15c10ededd667c372af4aefaf7419e8a Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Mon, 15 Mar 2021 10:44:03 +0000 Subject: [PATCH 1/7] Add druid metadata crawler to ingestion framework --- metadata-ingestion/README.md | 18 ++++++++++++++++ .../src/datahub/ingestion/source/druid.py | 21 +++++++++++++++++++ .../ingestion/source/source_registry.py | 8 ++++++- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/druid.py diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index b7c40913bef4b9..8b8b66ca706dfb 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -343,6 +343,24 @@ source: # table_pattern/schema_pattern is same as above ``` +### Druid `druid` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table + +```yml +source: + type: druid + config: + # Point to broker address + host_port: localhost:8082 + database: DemoDatabase + # table_pattern/schema_pattern is same as above + # options is same as above +``` + ### LDAP `ldap` Extracts: diff --git a/metadata-ingestion/src/datahub/ingestion/source/druid.py b/metadata-ingestion/src/datahub/ingestion/source/druid.py new file mode 100644 index 00000000000000..24e073026667b5 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/druid.py @@ -0,0 +1,21 @@ +# This import verifies that the dependencies are available. +import pydruid # noqa: F401 + +from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource + +class DruidConfig(BasicSQLAlchemyConfig): + # defaults + scheme = "druid" + + def get_sql_alchemy_url(self): + return f"{BasicSQLAlchemyConfig.get_sql_alchemy_url(self)}/druid/v2/sql/" + + +class DruidSource(SQLAlchemySource): + def __init__(self, config, ctx): + super().__init__(config, ctx, "druid") + + @classmethod + def create(cls, config_dict, ctx): + config = DruidConfig.parse_obj(config_dict) + return cls(config, ctx) \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py index 077ad9aeaebb2a..fd34bcb470f6c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py @@ -64,10 +64,16 @@ except ImportError as e: source_registry.register_disabled("kafka", e) - try: from .ldap import LDAPSource source_registry.register("ldap", LDAPSource) except ImportError as e: source_registry.register_disabled("ldap", e) + +try: + from .druid import DruidSource + + source_registry.register("druid", DruidSource) +except ImportError as e: + source_registry.register_disabled("druid", e) \ No newline at end of file From 38596433817993772c12a4641d03d0ebd20dd1f1 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Mon, 15 Mar 2021 22:01:04 +0000 Subject: [PATCH 2/7] Adds plugin requirements for druid crawler --- metadata-ingestion/README.md | 1 + metadata-ingestion/setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 8b8b66ca706dfb..6eb83b7d98a30d 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -93,6 +93,7 @@ We use a plugin architecture so that you can install only the dependencies you a | snowflake | `pip install -e '.[snowflake]'` | Snowflake source | | ldap | `pip install -e '.[ldap]'` ([extra requirements]) | LDAP source | | kakfa | `pip install -e '.[kafka]'` | Kafka source | +| druid | `pip install -e '.[druid]'` | Druid Source | | datahub-rest | `pip install -e '.[datahub-rest]'` | DataHub sink over REST API | | datahub-kafka | `pip install -e '.[datahub-kafka]'` | DataHub sink over Kafka | diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8ec7dfab0b20c8..ebc6dc44440a61 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -68,6 +68,7 @@ def get_long_description(): "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, "snowflake": sql_common | {"snowflake-sqlalchemy"}, "ldap": {"python-ldap>=2.4"}, + "druid": sql_common | {"pydruid>=0.6.2"}, # Sink plugins. "datahub-kafka": kafka_common, "datahub-rest": {"requests>=2.25.1"}, From 6afe495488cf046418010b30041171c6b22faec7 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 16 Mar 2021 10:59:09 +0000 Subject: [PATCH 3/7] Add schema pattern to avoid crawling internal druid databases --- metadata-ingestion/src/datahub/ingestion/source/druid.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/druid.py b/metadata-ingestion/src/datahub/ingestion/source/druid.py index 24e073026667b5..552fb4e9342d86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/druid.py @@ -2,10 +2,12 @@ import pydruid # noqa: F401 from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource +from datahub.configuration.common import AllowDenyPattern, ConfigModel class DruidConfig(BasicSQLAlchemyConfig): # defaults scheme = "druid" + schema_pattern: AllowDenyPattern = AllowDenyPattern(deny=["^(lookup|sys).*"]) def get_sql_alchemy_url(self): return f"{BasicSQLAlchemyConfig.get_sql_alchemy_url(self)}/druid/v2/sql/" From 7db3c6bb00c2fccb6076cb5975c399806f3f194e Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 16 Mar 2021 17:13:37 +0000 Subject: [PATCH 4/7] Adds warning note about druid-internal databases and how to specify the deny schema patterns + code formats --- metadata-ingestion/README.md | 9 +++++++-- metadata-ingestion/src/datahub/ingestion/source/druid.py | 8 +++++--- .../src/datahub/ingestion/source/source_registry.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 6eb83b7d98a30d..7e596cdce53096 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -351,14 +351,19 @@ Extracts: - List of databases, schema, and tables - Column types associated with each table +**Note** It is important to define a explicitly define deny schema pattern for internal druid databases (lookup & sys) +if adding a schema pattern otherwise the crawler may crash before processing relevant databases. +This deny pattern is defined by default but is overriden by user-submitted configurations + ```yml source: type: druid config: # Point to broker address host_port: localhost:8082 - database: DemoDatabase - # table_pattern/schema_pattern is same as above + schema_pattern: + deny: + - "^(lookup|sys).*" # options is same as above ``` diff --git a/metadata-ingestion/src/datahub/ingestion/source/druid.py b/metadata-ingestion/src/datahub/ingestion/source/druid.py index 552fb4e9342d86..805634807232f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/druid.py @@ -1,9 +1,11 @@ # This import verifies that the dependencies are available. -import pydruid # noqa: F401 +import pydruid # noqa: F401 -from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource from datahub.configuration.common import AllowDenyPattern, ConfigModel +from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource + + class DruidConfig(BasicSQLAlchemyConfig): # defaults scheme = "druid" @@ -20,4 +22,4 @@ def __init__(self, config, ctx): @classmethod def create(cls, config_dict, ctx): config = DruidConfig.parse_obj(config_dict) - return cls(config, ctx) \ No newline at end of file + return cls(config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py index fd34bcb470f6c2..d9e290678b3907 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py @@ -76,4 +76,4 @@ source_registry.register("druid", DruidSource) except ImportError as e: - source_registry.register_disabled("druid", e) \ No newline at end of file + source_registry.register_disabled("druid", e) From 955d5cea393dcc974131e086f42e70aff5922c47 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 16 Mar 2021 17:54:42 +0000 Subject: [PATCH 5/7] Remove unused import --- metadata-ingestion/src/datahub/ingestion/source/druid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/druid.py b/metadata-ingestion/src/datahub/ingestion/source/druid.py index 805634807232f5..7d3171b5ff8b7b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/druid.py @@ -1,7 +1,7 @@ # This import verifies that the dependencies are available. import pydruid # noqa: F401 -from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.common import AllowDenyPattern from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource From dce658c4da462e38e0b4f09147bd8480d410dcce Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 16 Mar 2021 22:28:45 +0000 Subject: [PATCH 6/7] Add pydruid to list of missing packages types --- metadata-ingestion/setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index b0ea9844a826cc..8bdff60c7751ea 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -42,6 +42,8 @@ ignore_missing_imports = yes ignore_missing_imports = yes [mypy-snowflake.*] ignore_missing_imports = yes +[mypy-pydruid.*] +ignore_missing_imports = yes [isort] profile = black From 7a9f64aaf7ffc0bf40a235b7a92d795851caaf46 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Wed, 17 Mar 2021 16:31:52 +0000 Subject: [PATCH 7/7] Adjusts test code coverage requirement to 75% --- metadata-ingestion/setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 8bdff60c7751ea..2187ede78ce1a7 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -58,7 +58,7 @@ testpaths = tests/integration [coverage:report] -fail_under = 80 +fail_under = 75 show_missing = true exclude_lines = pragma: no cover