forked from sebastiandaberdaku/spark-with-glue-builder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDockerfile
138 lines (123 loc) · 6.79 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Spark is built with PySpark support for Python 3.10, so the image needs both Python and Java.
# Starting from a Python image and installing the JDK afterwards is the faster route.
FROM python:3.10.14-bookworm
# Install the JDK plus the tools used by later steps (wget for downloads, patch for source patches).
# The Debian sid repository is added before installing openjdk-8-jdk
# (presumably because the package is not available from bookworm - confirm).
# `set -eux` aborts the step at the first failing command; without it the result of the whole
# chain was the exit status of the final `rm -rf`, so a failed `apt-get` went unnoticed.
# NOTE(review): the sid entry stays in the image's APT sources after this step.
RUN set -eux; \
    echo "deb http://ftp.de.debian.org/debian sid main" >> /etc/apt/sources.list; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
        openjdk-8-jdk \
        patch \
        wget; \
    rm -rf /var/lib/apt/lists/*
# Install Maven: download the binary tarball, unpack it into /opt/maven and drop the archive.
ARG MAVEN_VERSION=3.8.8
# `set -eux` stops at the first failing command. Previously a failed download or extraction was
# hidden because the chain's exit status came from the trailing `rm`, which succeeds even when
# wget only left an empty file behind.
RUN set -eux; \
    wget --quiet -O /opt/maven.tar.gz "https://apache.org/dyn/closer.lua/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz?action=download"; \
    mkdir -p /opt/maven; \
    tar zxf /opt/maven.tar.gz --strip-components=1 --directory=/opt/maven; \
    rm /opt/maven.tar.gz
ENV MAVEN_HOME=/opt/maven
# NOTE(review): this path is specific to amd64 builds of the openjdk-8 package.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
# PATH is set in its own instruction so that ${MAVEN_HOME} from the ENV above is already defined.
ENV PATH=$PATH:${MAVEN_HOME}/bin
WORKDIR /opt
# Download and extract the Glue Data Catalog Client sources into /opt/glue.
# The release tag of the client repository is selected with SPARK_VERSION.
ARG SPARK_VERSION=3.5.1
# `set -eux` makes a failed download or extraction fail the build right here instead of being
# masked by the exit status of the trailing `rm`.
RUN set -eux; \
    wget --quiet -O /opt/glue.tar.gz "https://github.com/sebastiandaberdaku/aws-glue-data-catalog-client-for-apache-hive-metastore/archive/refs/tags/v${SPARK_VERSION}.tar.gz"; \
    mkdir -p /opt/glue; \
    tar zxf /opt/glue.tar.gz --strip-components=1 --directory=/opt/glue; \
    rm /opt/glue.tar.gz
## Patching Apache Hive and Installing It Locally
# Download and extract the Apache Hive 2 sources into /opt/hive2.
ARG HIVE2_VERSION=2.3.9
# `set -eux` makes a failed download or extraction fail the build right here instead of being
# masked by the exit status of the trailing `rm`.
RUN set -eux; \
    wget --quiet -O /opt/hive2.tar.gz "https://github.com/apache/hive/archive/rel/release-${HIVE2_VERSION}.tar.gz"; \
    mkdir -p /opt/hive2; \
    tar zxf /opt/hive2.tar.gz --strip-components=1 --directory=/opt/hive2; \
    rm /opt/hive2.tar.gz
# Add the 2.3 version patch file
COPY ./HIVE-12679.branch-2.3.patch /opt/hive2/
# The conjars repository is dead; the .mvn configuration mirrors it to another repository
# so the jars can still be downloaded.
COPY ./.mvn/ /opt/hive2/.mvn/
# Apply the patch, then build and install Hive 2 into the local Maven repository.
# `set -eux` aborts as soon as `cd` or `patch` fails; previously a failed patch was ignored
# and Maven went on to build the unpatched sources.
RUN set -eux; \
    cd /opt/hive2; \
    patch -p0 <HIVE-12679.branch-2.3.patch; \
    mvn -T "$(nproc)" clean install -DskipTests
# Download and extract the Apache Hive 3 sources into /opt/hive3.
ARG HIVE3_VERSION=3.1.3
# `set -eux` makes a failed download or extraction fail the build right here instead of being
# masked by the exit status of the trailing `rm`.
RUN set -eux; \
    wget --quiet -O /opt/hive3.tar.gz "https://github.com/apache/hive/archive/rel/release-${HIVE3_VERSION}.tar.gz"; \
    mkdir -p /opt/hive3; \
    tar zxf /opt/hive3.tar.gz --strip-components=1 --directory=/opt/hive3; \
    rm /opt/hive3.tar.gz
# The conjars repository is dead; the .mvn configuration mirrors it to another repository
# so the jars can still be downloaded.
COPY ./.mvn/ /opt/hive3/.mvn/
# Continue with patching the 3.1 branch: the patch file ships with the Glue client sources.
# `set -eux` aborts as soon as `cp`, `cd` or `patch` fails; previously such failures were
# ignored and Maven went on to build whatever was left in the source tree.
# NOTE(review): `patch --merge` exits non-zero when it leaves conflicts behind, so the build
# now stops in that case - confirm the patch applies without conflicts to this release.
RUN set -eux; \
    cp /opt/glue/branch_3.1.patch /opt/hive3; \
    cd /opt/hive3; \
    patch -p1 --merge <branch_3.1.patch; \
    mvn -T "$(nproc)" clean install -DskipTests
## Building the Glue Data Catalog Client
# Hive has been patched and installed at this point, so the Glue client can be built.
# The .mvn folder supplies the replacement for the missing conjars repository.
COPY ./.mvn/ /opt/glue/.mvn/
# The build is started from the root directory of the AWS Glue Data Catalog Client repository,
# which is required for all clients. It produces both the Hive and the Spark client together
# with their dependencies.
ARG HADOOP_VERSION=3.3.4
RUN cd /opt/glue; \
    mvn --threads "$(nproc)" \
        -Dhadoop.version="${HADOOP_VERSION}" \
        -Dhive3.version="${HIVE3_VERSION}" \
        -Dspark-hive.version="${HIVE2_VERSION}" \
        -DskipTests \
        clean install
## Build Spark
# Fetch the Spark sources and extract them into /opt/spark.
# `set -eux` makes a failed download or extraction fail the build right here instead of being
# masked by the exit status of the trailing `rm`.
RUN set -eux; \
    wget --quiet -O /opt/spark.tar.gz "https://github.com/apache/spark/archive/refs/tags/v${SPARK_VERSION}.tar.gz"; \
    mkdir -p /opt/spark; \
    tar zxf /opt/spark.tar.gz --strip-components=1 --directory=/opt/spark; \
    rm /opt/spark.tar.gz
# Setting up Maven's Memory Usage
ENV MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
# Patch (see: https://issues.apache.org/jira/browse/SPARK-45201) and build a runnable Spark distribution
COPY "./spark-${SPARK_VERSION}.patch" /opt/spark/
ARG SCALA_VERSION=2.12
# MAKEFLAGS is exported inside the RUN shell rather than through ENV: ENV only performs
# $var / ${var} substitution, so `ENV MAKEFLAGS="-j$(nproc)"` stored the literal text
# "-j$(nproc)" instead of the number of processors.
# `set -eux` aborts as soon as `cd` or `patch` fails; previously a failed patch was ignored
# and the distribution was built from unpatched sources.
# The Hadoop profile is derived from the major version (${HADOOP_VERSION%%.*}) and the short
# Hive version from major.minor (${HIVE2_VERSION%.*}).
RUN set -eux; \
    export MAKEFLAGS="-j$(nproc)"; \
    cd /opt/spark; \
    patch -p1 <"spark-${SPARK_VERSION}.patch"; \
    ./dev/make-distribution.sh \
        --name spark \
        --pip \
        -P"scala-${SCALA_VERSION}" \
        -Pconnect \
        -Pkubernetes \
        -Phive \
        -Phive-thriftserver \
        -P"hadoop-${HADOOP_VERSION%%.*}" \
        -Dhadoop.version="${HADOOP_VERSION}" \
        -Dhive.version="${HIVE2_VERSION}" \
        -Dhive23.version="${HIVE2_VERSION}" \
        -Dhive.version.short="${HIVE2_VERSION%.*}"
ARG SPARK_DIST_DIR=/opt/spark/dist
# IMPORTANT! The spark-connect-common jar must be removed from the jars directory,
# see: https://issues.apache.org/jira/browse/SPARK-45201
# Afterwards the Glue client is added to the Spark jars directory; of the client build outputs
# only the AWS Glue Spark Client jar is needed.
RUN rm "${SPARK_DIST_DIR}/jars/spark-connect-common_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
    && cp \
        "/opt/glue/aws-glue-datacatalog-spark-client/target/aws-glue-datacatalog-spark-client-${SPARK_VERSION}.jar" \
        "${SPARK_DIST_DIR}/jars/"
# The following steps are optional.
# These jars are downloaded straight into the image's Spark jars directory so that they do not
# have to be downloaded when Spark starts up.
# Every version is a build argument; the defaults are the versions that were previously
# hard-coded in the URLs, so a build without --build-arg fetches exactly the same files.
# Each group runs under `set -eux`, so the step stops at the first failed download.

# AWS Java SDK bundle library
ARG AWS_JAVA_SDK_VERSION=1.12.262
RUN set -eux; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar"

# Hadoop AWS library, downloaded together with wildfly-openssl
ARG WILDFLY_OPENSSL_VERSION=1.0.7.Final
RUN set -eux; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/org/wildfly/openssl/wildfly-openssl/${WILDFLY_OPENSSL_VERSION}/wildfly-openssl-${WILDFLY_OPENSSL_VERSION}.jar"

# PostgreSQL library, downloaded together with checker-qual
ARG POSTGRES_VERSION=42.6.0
ARG CHECKER_QUAL_VERSION=3.31.0
RUN set -eux; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/org/postgresql/postgresql/${POSTGRES_VERSION}/postgresql-${POSTGRES_VERSION}.jar"; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/org/checkerframework/checker-qual/${CHECKER_QUAL_VERSION}/checker-qual-${CHECKER_QUAL_VERSION}.jar"

# Delta IO libraries, downloaded together with antlr4-runtime
ARG DELTA_VERSION=3.2.0
ARG ANTLR4_RUNTIME_VERSION=4.9.3
RUN set -eux; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/io/delta/delta-spark_${SCALA_VERSION}/${DELTA_VERSION}/delta-spark_${SCALA_VERSION}-${DELTA_VERSION}.jar"; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/${ANTLR4_RUNTIME_VERSION}/antlr4-runtime-${ANTLR4_RUNTIME_VERSION}.jar"; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VERSION}/delta-storage-${DELTA_VERSION}.jar"; \
    wget --quiet -P "${SPARK_DIST_DIR}/jars/" "https://repo1.maven.org/maven2/io/delta/delta-storage-s3-dynamodb/${DELTA_VERSION}/delta-storage-s3-dynamodb-${DELTA_VERSION}.jar"
# Download the Hadoop binary release for HADOOP_VERSION and unpack the whole archive into
# HADOOP_HOME (this is where the Hadoop native libraries come from).
# HADOOP_HOME is a build argument only; it is not exported into the image environment.
ARG HADOOP_HOME=/opt/hadoop
# `set -eux` makes a failed download or extraction fail the build right here instead of being
# masked by the exit status of the trailing `rm`. ${HADOOP_HOME} is quoted everywhere so a
# custom path containing spaces is handled consistently.
RUN set -eux; \
    wget "https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" -O /opt/hadoop.tar.gz; \
    mkdir -p "${HADOOP_HOME}"; \
    tar zxf /opt/hadoop.tar.gz --strip-components=1 --directory="${HADOOP_HOME}"; \
    rm /opt/hadoop.tar.gz