Skip to content

Commit

Permalink
feat(ingest): basic support for complex hive types (#2804)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored Jul 1, 2021
1 parent 725abf5 commit 6fe663b
Show file tree
Hide file tree
Showing 8 changed files with 586 additions and 2 deletions.
2 changes: 1 addition & 1 deletion metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_long_description():
| {
# Acryl Data maintains a fork of PyHive, which adds support for table comments
# and column comments, and also releases HTTP and HTTPS transport schemes.
"acryl-pyhive[hive]>=0.6.9"
"acryl-pyhive[hive]>=0.6.10"
},
"ldap": {"python-ldap>=2.4"},
"looker": {"looker-sdk==21.6.0"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,8 @@ def get_schema_metadata(
for column in columns:
field = SchemaField(
fieldPath=column["name"],
nativeDataType=repr(column["type"]),
type=get_column_type(sql_report, dataset_name, column["type"]),
nativeDataType=column.get("full_type", repr(column["type"])),
description=column.get("comment", None),
nullable=column["nullable"],
recursive=False,
Expand Down
56 changes: 56 additions & 0 deletions metadata-ingestion/tests/integration/hive/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Adapted from https://github.com/big-data-europe/docker-hive.
#
# Hadoop + Hive stack used by the Hive integration test in this directory.
# The Hadoop/Hive containers load their shared settings from
# ./hadoop-hive.env; per-service overrides live under `environment`.

# Quoted so the Compose file format version is read as a string.
version: "3"

services:
  # HDFS NameNode; its metadata directory lives on the `namenode` named volume.
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8
    volumes:
      - namenode:/hadoop/dfs/name
    environment:
      CLUSTER_NAME: "test"
    env_file:
      - ./hadoop-hive.env
    ports:
      - "50070:50070"

  # HDFS DataNode; block storage lives on the `datanode` named volume.
  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
    volumes:
      - datanode:/hadoop/dfs/data
    env_file:
      - ./hadoop-hive.env
    environment:
      # Presumably the image entrypoint blocks until each listed host:port
      # is reachable -- confirm against the bde2020 image documentation.
      SERVICE_PRECONDITION: "namenode:50070"
    ports:
      - "50075:50075"

  # HiveServer2, published on host port 10000. The fixed container name
  # gives the container a stable, predictable name.
  hive-server:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: "testhiveserver2"
    env_file:
      - ./hadoop-hive.env
    environment:
      # NOTE(review): this URL targets host `hive-metastore`, whereas
      # hadoop-hive.env points the same JDO option at
      # `hive-metastore-postgresql` -- confirm which one is intended.
      HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
      SERVICE_PRECONDITION: "hive-metastore:9083"
    ports:
      - "10000:10000"
    volumes:
      # Makes the setup script available at /hive_setup.sql in the container.
      - ./hive_setup.sql:/hive_setup.sql

  # Hive metastore service, listening on 9083 (the port referenced by
  # hive_metastore_uris in hadoop-hive.env).
  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    env_file:
      - ./hadoop-hive.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode:50075 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"

  # PostgreSQL database backing the metastore; no host ports are published,
  # so it is only reachable from the other services by its service name.
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.3.0

  # presto-coordinator:
  #   image: shawnzhu/prestodb:0.181
  #   ports:
  #     - "8080:8080"

# Named volumes with default driver settings.
volumes:
  namenode: {}
  datanode: {}
30 changes: 30 additions & 0 deletions metadata-ingestion/tests/integration/hive/hadoop-hive.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Shared environment for the Hadoop/Hive containers (loaded through
# `env_file` in docker-compose.yml).
#
# Naming convention, as used consistently by the keys below: a prefix
# selects the target config (HIVE_SITE_CONF, CORE_CONF, HDFS_CONF,
# YARN_CONF), a single `_` separates property name segments, and `___`
# stands for a hyphen. Presumably the image entrypoint performs this
# translation -- confirm against the bde2020 image documentation.

# --- Hive metastore connection ---
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

# --- Hadoop core ---
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

# --- HDFS ---
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false

# --- YARN ---
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
# Hyphen encoded as `___` like every other hyphenated key in this file, so
# the key matches the `resource-tracker` property name.
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
Loading

0 comments on commit 6fe663b

Please sign in to comment.