Add Hive metadata ETL process #9

Merged · 2 commits · Dec 21, 2015
3 changes: 3 additions & 0 deletions backend-service/app/actors/EtlJobFactory.java
@@ -16,6 +16,7 @@
import java.util.Properties;
import metadata.etl.EtlJob;
import metadata.etl.dataset.hdfs.HdfsMetadataEtl;
+import metadata.etl.dataset.hive.HiveMetadataEtl;
import metadata.etl.dataset.teradata.TeradataMetadataEtl;
import metadata.etl.git.GitMetadataEtl;
import metadata.etl.lineage.AzLineageMetadataEtl;
@@ -49,6 +50,8 @@ public static EtlJob getEtlJob(EtlJobName etlJobName, Integer refId, Long whExec
        return new LdapEtl(refId, whExecId, properties);
      case GIT_MEDATA_ETL:
        return new GitMetadataEtl(refId, whExecId, properties);
+     case HIVE_DATASET_METADATA_ETL:
+       return new HiveMetadataEtl(refId, whExecId, properties);
      default:
        throw new UnsupportedOperationException("Unsupported job type: " + etlJobName);
    }
1 change: 1 addition & 0 deletions backend-service/app/models/EtlJobName.java
@@ -25,6 +25,7 @@ public enum EtlJobName {
  HADOOP_DATASET_OWNER_ETL(EtlType.OWNER, RefIdType.DB),
  LDAP_USER_ETL(EtlType.LDAP, RefIdType.APP),
  GIT_MEDATA_ETL(EtlType.VCS, RefIdType.APP),
+ HIVE_DATASET_METADATA_ETL(EtlType.DATASET, RefIdType.DB),
  ;

  EtlType etlType;
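Together, the new enum constant and the factory case wire the Hive job end to end: a caller names HIVE_DATASET_METADATA_ETL and the factory hands back a HiveMetadataEtl ready to run. A minimal sketch of that flow — the driver class, the refId/whExecId values, and the property value are hypothetical; only getEtlJob, the enum constant, and HiveMetadataEtl come from this PR:

import java.util.Properties;
import actors.EtlJobFactory;
import metadata.etl.EtlJob;
import models.EtlJobName;

// Hypothetical driver for the newly registered job type.
public class HiveEtlDriver {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("hive.metastore.jdbc.url",
        "jdbc:mysql://metastore-host:3306/hive"); // assumed placeholder value
    // refId identifies the registered Hive source (RefIdType.DB); whExecId is the run id.
    EtlJob job = EtlJobFactory.getEtlJob(EtlJobName.HIVE_DATASET_METADATA_ETL, 1, 42L, props);
    job.extract();   // runs jython/HiveExtract.py
    job.transform(); // runs jython/HiveTransform.py
    job.load();      // runs jython/HiveLoad.py
  }
}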
4 changes: 3 additions & 1 deletion build.gradle
@@ -69,5 +69,7 @@ subprojects {
"play" : "com.typesafe.play:play_2.10:2.2.4",
"play_ebean" : "com.typesafe.play:play-java-ebean_2.10:2.2.4",
"play_java_jdbc" : "com.typesafe.play:play-java-jdbc_2.10:2.2.4",
"play_cache" : "com.typesafe.play:play-cache_2.10:2.2.4"]
"play_cache" : "com.typesafe.play:play-cache_2.10:2.2.4",
"hive_exec" : "org.apache.hive:hive-exec:1.2.1"
]
}
1 change: 1 addition & 0 deletions metadata-etl/build.gradle
@@ -22,6 +22,7 @@ dependencies {
compile externalDependency.akka
compile externalDependency.slf4j_api
compile externalDependency.slf4j_log4j
+ compile externalDependency.hive_exec
compile files("extralibs/terajdbc4-15.00.00.20.jar")
compile files("extralibs/tdgssconfig-15.00.00.20.jar")
// compile externalDependency.jython
50 changes: 50 additions & 0 deletions metadata-etl/src/main/java/metadata/etl/dataset/hive/HiveMetadataEtl.java
@@ -0,0 +1,50 @@
package metadata.etl.dataset.hive;

import java.io.InputStream;
import java.util.Properties;
import metadata.etl.EtlJob;


/**
 * Created by zsun on 11/16/15.
 */
public class HiveMetadataEtl extends EtlJob {

  @Deprecated
  public HiveMetadataEtl(int dbId, long whExecId) {
    super(null, dbId, whExecId);
  }

  public HiveMetadataEtl(int dbId, long whExecId, Properties prop) {
    super(null, dbId, whExecId, prop);
  }

  @Override
  public void extract()
      throws Exception {
    logger.info("In Hive metadata ETL, launch extract jython scripts");
    InputStream inputStream = classLoader.getResourceAsStream("jython/HiveExtract.py");
    //logger.info("before call scripts " + interpreter.getSystemState().argv);
    interpreter.execfile(inputStream);
    inputStream.close();
  }

  @Override
  public void transform()
      throws Exception {
    logger.info("In Hive metadata ETL, launch transform jython scripts");
    InputStream inputStream = classLoader.getResourceAsStream("jython/HiveTransform.py");
    interpreter.execfile(inputStream);
    inputStream.close();
  }

  @Override
  public void load()
      throws Exception {
    logger.info("In Hive metadata ETL, launch load jython scripts");
    InputStream inputStream = classLoader.getResourceAsStream("jython/HiveLoad.py");
    interpreter.execfile(inputStream);
    inputStream.close();
  }
}
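Each of the three phases follows the same pattern: resolve a Jython script bundled on the classpath and execute it in the interpreter inherited from EtlJob. A standalone sketch of that mechanism, assuming a Jython dependency such as org.python:jython-standalone is available (the runner class is hypothetical; the resource name is the one used above):

import java.io.InputStream;
import org.python.util.PythonInterpreter;

// Standalone illustration of the classpath-script pattern HiveMetadataEtl relies on.
public class JythonResourceRunner {
  public static void main(String[] args) throws Exception {
    PythonInterpreter interpreter = new PythonInterpreter();
    InputStream script = JythonResourceRunner.class.getClassLoader()
        .getResourceAsStream("jython/HiveExtract.py");
    try {
      interpreter.execfile(script); // blocks until the script finishes
    } finally {
      script.close(); // released even if the script raises
    }
  }
}

Note the finally block: the methods above close their streams only on the success path, so this sketch is slightly more defensive than the PR code.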
30 changes: 30 additions & 0 deletions metadata-etl/src/main/java/metadata/etl/dataset/hive/HiveViewDependency.java
@@ -0,0 +1,30 @@
package metadata.etl.dataset.hive;

import java.util.TreeSet;
import org.apache.hadoop.hive.ql.tools.LineageInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Created by zsun on 12/14/15.
 */
public class HiveViewDependency {
  final Logger logger = LoggerFactory.getLogger(getClass());
  LineageInfo lineageInfoTool;

  public HiveViewDependency() {
    lineageInfoTool = new LineageInfo();
  }

  public String[] getViewDependency(String hiveQl) {
    try {
      lineageInfoTool.getLineageInfo(hiveQl);
      TreeSet<String> inputs = lineageInfoTool.getInputTableList();
      return inputs.toArray(new String[inputs.size()]);
    } catch (Exception e) {
      logger.error("Sql statements : \n" + hiveQl + "\n parse ERROR, will return an empty String array");
      logger.error(String.valueOf(e.getCause()));
      return new String[]{};
    }
  }
}
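HiveViewDependency delegates the parsing to hive-exec's LineageInfo, which walks the statement and collects every table it reads from; parse failures are swallowed and reported as an empty result. A hypothetical usage sketch (the query text and demo class are made up; only HiveViewDependency and LineageInfo are real here):

import metadata.etl.dataset.hive.HiveViewDependency;

// Hypothetical caller: list the base tables a Hive view's SELECT reads from.
public class ViewDependencyDemo {
  public static void main(String[] args) {
    HiveViewDependency resolver = new HiveViewDependency();
    String viewQl =
        "SELECT o.id, c.name FROM db1.orders o JOIN db1.customers c ON o.cust_id = c.id";
    for (String table : resolver.getViewDependency(viewQl)) {
      System.out.println(table); // db1.customers, db1.orders (TreeSet keeps them sorted)
    }
    // Unparseable input is logged and yields an empty array rather than an exception.
    System.out.println(resolver.getViewDependency("not valid hiveql").length); // 0
  }
}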
10 changes: 10 additions & 0 deletions metadata-etl/src/main/resources/application.properties.template
@@ -106,3 +106,13 @@ ldap.group.search.return.attributes=
git.host=
git.project.whitelist=

+# hive metastore
+hive.metastore.jdbc.url=
+hive.metastore.jdbc.driver=
+hive.metastore.username=
+hive.metastore.password=
+
+hive.schema_json_file=
+#hive.sample_csv=
+hive.schema_csv_file=
+hive.field_metadata=
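For orientation, the new block might look like this once filled in; every value below is a made-up placeholder (assuming a MySQL-backed metastore), not a recommendation from the PR:

hive.metastore.jdbc.url=jdbc:mysql://metastore-host:3306/hive
hive.metastore.jdbc.driver=com.mysql.jdbc.Driver
hive.metastore.username=etl_reader
hive.metastore.password=********

# presumably the intermediate files handed between the extract/transform/load scripts
hive.schema_json_file=/var/tmp/wherehows/hive_schema.json
hive.schema_csv_file=/var/tmp/wherehows/hive_schema.csv
hive.field_metadata=/var/tmp/wherehows/hive_field_metadata.csv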
2 changes: 1 addition & 1 deletion metadata-etl/src/main/resources/jython/HdfsLoad.py
@@ -30,7 +30,7 @@ def load_metadata(self):

LOAD DATA LOCAL INFILE '{source_file}'
INTO TABLE stg_dict_dataset
-    FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
+    FIELDS TERMINATED BY '\Z' ESCAPED BY '\\'
(`name`, `schema`, properties, fields, urn, source, sample_partition_full_path, source_created_time, source_modified_time)
SET db_id = {db_id},
-- TODO storage_type = 'Avro',
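The one-line change above swaps the LOAD DATA escape character from NUL to a backslash. A plausible reading — an assumption, since the writer side is not part of this diff — is that the staging file doubles literal backslashes, and ESCAPED BY '\\' tells MySQL to undo that on load, whereas the old ESCAPED BY '\0' would have left the doubled backslashes in the loaded rows. A small sketch of the writer-side convention this implies (class and sample data are hypothetical):

// Hypothetical writer-side counterpart of ESCAPED BY '\\': double literal
// backslashes before a field is written to the staging file.
public class StagingFieldEscaper {
  static final char FIELD_TERMINATOR = 0x1A; // '\Z' in the MySQL literal is Ctrl-Z

  static String escapeField(String value) {
    return value.replace("\\", "\\\\");
  }

  public static void main(String[] args) {
    String[] row = { "events", "{\"type\":\"record\"}", "hdfs:///data\\2015" };
    StringBuilder line = new StringBuilder();
    for (int i = 0; i < row.length; i++) {
      if (i > 0) line.append(FIELD_TERMINATOR);
      line.append(escapeField(row[i]));
    }
    System.out.println(line); // one staging record, fields separated by Ctrl-Z
  }
}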