[#241] Hive2 docker runtime environment (#267)
### What changes were proposed in this pull request?

Build a Hive runtime environment Docker image that includes:

1. Hadoop-2.7.3
2. Hive-2.3.9
3. MySQL
4. OpenJDK-8u382
5. Support for the `linux/arm64` and `linux/amd64` architectures

These Hadoop and Hive versions match the ones Graviton depends on.

### Why are the changes needed?

The Hive catalog integration tests need to connect to a real Hive runtime
environment.

Fix: #241 

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

Successfully built the image and pushed it to Docker Hub:
https://hub.docker.com/repository/docker/datastrato/hive2/tags?page=1&ordering=last_updated
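
The published image can be pulled directly (tag taken from `build-docker.sh` in this PR):

    docker pull datastrato/hive2:0.1.0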

[Screenshot: Docker Hub tags page for the datastrato/hive2 image]
xunliu authored Aug 29, 2023
1 parent b69d459 commit 005a0d3
Showing 12 changed files with 507 additions and 3 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -34,4 +34,6 @@ out/**
*.iws

distribution
server/src/main/resources/project.properties
server/src/main/resources/project.properties

dev/docker/hive2/packages
5 changes: 3 additions & 2 deletions build.gradle.kts
@@ -114,8 +114,9 @@ tasks.rat {
inputDir.set(project.rootDir)

val exclusions = mutableListOf(
// Ignore files we track but do not distribute
"**/.github/**/*",
// Ignore files we track but do not distribute
"**/.github/**/*",
"dev/docker/**/*.xml",
)

// Add .gitignore excludes to the Apache Rat exclusion list.
160 changes: 160 additions & 0 deletions dev/docker/hive2/Dockerfile
@@ -0,0 +1,160 @@
#
# Copyright 2023 Datastrato.
# This software is licensed under the Apache License version 2.
#

FROM ubuntu:16.04
LABEL maintainer="[email protected]"

ARG HADOOP_PACKAGE_NAME
ARG HIVE_PACKAGE_NAME

WORKDIR /

################################################################################
# update and install basic tools
RUN apt-get update && apt-get upgrade -y && apt-get install --fix-missing -yq \
git \
libkrb5-dev \
libmysqlclient-dev \
libssl-dev \
libsasl2-dev \
libsasl2-modules-gssapi-mit \
libsqlite3-dev \
libtidy-0.99-0 \
libxml2-dev \
libxslt-dev \
libffi-dev \
libldap2-dev \
python-dev \
python-setuptools \
libgmp3-dev \
libz-dev \
curl \
software-properties-common \
vim \
openssh-server \
wget \
sudo \
openjdk-8-jdk

#################################################################################
## setup ssh
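# Hadoop's start-dfs.sh/start-yarn.sh scripts launch daemons over ssh even on a
# single node, so generate a passwordless key for root and authorize it locally.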
RUN mkdir /root/.ssh
RUN cat /dev/zero | ssh-keygen -q -N "" > /dev/null && cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys

COPY packages /tmp/packages

################################################################################
# set environment variables
ENV JAVA_HOME=/usr/local/jdk
ENV HIVE_HOME=/usr/local/hive
ENV HADOOP_HOME=/usr/local/hadoop
ENV HADOOP_HEAPSIZE=8192
ENV HADOOP_INSTALL=${HADOOP_HOME}
ENV HADOOP_MAPRED_HOME=${HADOOP_INSTALL}
ENV HADOOP_COMMON_HOME=${HADOOP_INSTALL}
ENV HADOOP_HDFS_HOME=${HADOOP_INSTALL}
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
ENV YARN_HOME=${HADOOP_INSTALL}

ENV PATH=${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_INSTALL}/sbin:${HIVE_HOME}/bin:${PATH}
ENV CLASSPATH=${HADOOP_HOME}/lib/*:${HIVE_HOME}/lib/*:.
ENV LD_LIBRARY_PATH=${HADOOP_HOME}/lib/native

################################################################################
# link the architecture-specific OpenJDK installation to JAVA_HOME
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
    ln -s /usr/lib/jvm/java-8-openjdk-arm64 ${JAVA_HOME}; \
    else \
    ln -s /usr/lib/jvm/java-8-openjdk-amd64 ${JAVA_HOME}; \
    fi

# add the above env for all users
RUN echo "JAVA_HOME=${JAVA_HOME}" >> /etc/environment
RUN echo "HADOOP_HEAPSIZE=${HADOOP_HEAPSIZE}" >> /etc/environment
RUN echo "HADOOP_HOME=${HADOOP_HOME}" >> /etc/environment
RUN echo "HADOOP_INSTALL=${HADOOP_INSTALL}" >> /etc/environment
RUN echo "HADOOP_MAPRED_HOME=${HADOOP_MAPRED_HOME}" >> /etc/environment
RUN echo "HADOOP_COMMON_HOME=${HADOOP_COMMON_HOME}" >> /etc/environment
RUN echo "HADOOP_HDFS_HOME=${HADOOP_HDFS_HOME}" >> /etc/environment
RUN echo "HADOOP_CONF_DIR=${HADOOP_CONF_DIR}" >> /etc/environment
RUN echo "HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar" >> /etc/environment
RUN echo "YARN_HOME=${YARN_HOME}" >> /etc/environment
RUN echo "HIVE_HOME=${HIVE_HOME}" >> /etc/environment
RUN echo "PATH=${PATH}" >> /etc/environment
RUN echo "CLASSPATH=${CLASSPATH}" >> /etc/environment
RUN echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> /etc/environment

################################################################################
# install hadoop
RUN mkdir ${HADOOP_HOME}
RUN tar -xz -C ${HADOOP_HOME} --strip-components 1 -f /tmp/packages/${HADOOP_PACKAGE_NAME}

# replace configuration templates
RUN rm -f ${HADOOP_CONF_DIR}/core-site.xml
RUN rm -f ${HADOOP_CONF_DIR}/hadoop-env.sh
RUN rm -f ${HADOOP_CONF_DIR}/hdfs-site.xml
RUN rm -f ${HADOOP_CONF_DIR}/mapred-site.xml
RUN rm -f ${HADOOP_CONF_DIR}/yarn-site.xml

ADD core-site.xml ${HADOOP_CONF_DIR}/core-site.xml
ADD hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh
ADD hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml
ADD mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml
ADD yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml

# format HDFS (initialize the NameNode metadata at image build time)
RUN ${HADOOP_HOME}/bin/hdfs namenode -format -nonInteractive

################################################################################
# install hive
RUN mkdir ${HIVE_HOME}
RUN tar -xz -C ${HIVE_HOME} --strip-components 1 -f /tmp/packages/${HIVE_PACKAGE_NAME}
ADD hive-site.xml ${HIVE_HOME}/conf/hive-site.xml

################################################################################
# install MySQL
ENV MYSQL_PWD=ds123
RUN echo "mysql-server mysql-server/root_password password ${MYSQL_PWD}" | debconf-set-selections
RUN echo "mysql-server mysql-server/root_password_again password ${MYSQL_PWD}" | debconf-set-selections
RUN apt-get install -y mysql-server

RUN chown -R mysql:mysql /var/lib/mysql
RUN usermod -d /var/lib/mysql/ mysql

################################################################################
# add mysql jdbc driver
RUN wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.15.tar.gz
RUN tar -xzf mysql-connector-java-8.0.15.tar.gz
RUN cp mysql-connector-java-8.0.15/mysql-connector-java-8.0.15.jar ${HIVE_HOME}/lib
RUN rm -rf mysql-connector-java-8.0.15 mysql-connector-java-8.0.15.tar.gz

################################################################################
# add users and groups
RUN groupadd hdfs && groupadd hadoop && groupadd hive && groupadd mapred

RUN useradd -g hadoop datastrato && echo "datastrato:ds123" | chpasswd && adduser datastrato sudo
RUN usermod -s /bin/bash datastrato

RUN usermod -a -G hdfs datastrato
RUN usermod -a -G hadoop datastrato
RUN usermod -a -G hive datastrato
RUN usermod -a -G mapred datastrato

RUN mkdir /home/datastrato
RUN chown -R datastrato:hadoop /home/datastrato

################################################################################
# remove the downloaded install packages
RUN rm -rf /tmp/packages

################################################################################
# expose port
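# 8088: YARN ResourceManager UI, 50070/50075: HDFS NameNode/DataNode UIs,
# 10000/10002: HiveServer2 Thrift/web UI, 9083: Hive Metastore Thrift, 22: sshd.
# 8888 and 7180 are exposed, but no service installed in this image listens on them.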
EXPOSE 8088 50070 50075 10002 10000 8888 9083 7180 22

################################################################################
# create startup script and set ENTRYPOINT
WORKDIR /
ADD start.sh /usr/local/sbin
ENTRYPOINT ["/bin/bash", "/usr/local/sbin/start.sh"]
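
The `start.sh` entrypoint is among the remaining changed files in this PR. As a rough, assumed sketch (not the PR's actual script; the service ordering and commands are illustrative), a startup script for this image would do something like:

    #!/bin/bash
    # Hypothetical sketch of a start.sh for this image.
    service ssh start
    service mysql start

    # Initialize the Hive Metastore schema in MySQL (required once for Hive 2.x).
    ${HIVE_HOME}/bin/schematool -dbType mysql -initSchema

    # Start HDFS and YARN daemons; the passwordless ssh key set up earlier
    # lets the start-*.sh scripts work inside the container.
    ${HADOOP_HOME}/sbin/start-dfs.sh
    ${HADOOP_HOME}/sbin/start-yarn.sh

    # Run the Hive Metastore and HiveServer2 in the background.
    ${HIVE_HOME}/bin/hive --service metastore &
    ${HIVE_HOME}/bin/hive --service hiveserver2 &

    # Keep the container in the foreground.
    tail -f /dev/null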
18 changes: 18 additions & 0 deletions dev/docker/hive2/README.md
@@ -0,0 +1,18 @@
<!--
Copyright 2023 Datastrato.
This software is licensed under the Apache License version 2.
-->
# hive2
Build a Docker image that includes Hadoop 2 and Hive 2.

Build Image
===========
./build-docker.sh

Run container
=============
docker run --rm -d -p 8088:8088 -p 50070:50070 -p 50075:50075 -p 10000:10000 -p 10002:10002 -p 8888:8888 -p 9083:9083 -p 8022:22 datastrato/hive2:0.1.0

Login to the server
=============
ssh -p 8022 datastrato@localhost (password: ds123, this is a sudo user)
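
Quick checks once the container is running (example commands, assuming the port mappings above):

    # Connect to HiveServer2 with beeline (ships with Hive)
    beeline -u jdbc:hive2://localhost:10000

    # The NameNode web UI should answer over HTTP
    curl http://localhost:50070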
45 changes: 45 additions & 0 deletions dev/docker/hive2/build-docker.sh
@@ -0,0 +1,45 @@
#!/bin/bash
#
# Copyright 2023 Datastrato.
# This software is licensed under the Apache License version 2.
#
set -ex
bin="$(dirname "${BASH_SOURCE-$0}")"
bin="$(cd "${bin}">/dev/null; pwd)"

# Environment variables definition
IMAGE_NAME="datastrato/hive2:0.1.0"
HADOOP_VERSION="2.7.3"
HIVE_VERSION="2.3.9"

HADOOP_PACKAGE_NAME="hadoop-${HADOOP_VERSION}.tar.gz"
HADOOP_DOWNLOAD_URL="http://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/${HADOOP_PACKAGE_NAME}"

HIVE_PACKAGE_NAME="apache-hive-${HIVE_VERSION}-bin.tar.gz"
HIVE_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/${HIVE_PACKAGE_NAME}"

# Prepare download packages
if [[ ! -d "${bin}/packages" ]]; then
mkdir -p "${bin}/packages"
fi

if [ ! -f "${bin}/packages/${HADOOP_PACKAGE_NAME}" ]; then
curl -s -o "${bin}/packages/${HADOOP_PACKAGE_NAME}" ${HADOOP_DOWNLOAD_URL}
fi

if [ ! -f "${bin}/packages/${HIVE_PACKAGE_NAME}" ]; then
curl -s -o "${bin}/packages/${HIVE_PACKAGE_NAME}" ${HIVE_DOWNLOAD_URL}
fi

# Create multi-arch builder
BUILDER_NAME="hive2"
builders=$(docker buildx ls)
if echo "${builders}" | grep -q "${BUILDER_NAME}"; then
echo "BuildKit builder '${BUILDER_NAME}' already exists."
else
echo "BuildKit builder '${BUILDER_NAME}' does not exist."
docker buildx create --platform linux/amd64,linux/arm64 --use --name "${BUILDER_NAME}"
fi

# Option params --no-cache --push
docker buildx build --platform=linux/amd64,linux/arm64 --build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --output type=docker --progress plain -t ${IMAGE_NAME} .
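
The `--no-cache` and `--push` options mentioned in the comment can be added as needed; for example, publishing the multi-arch image instead of loading it locally (a sketch, replacing `--output type=docker` with `--push`):

    docker buildx build --platform=linux/amd64,linux/arm64 --build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --push --progress plain -t ${IMAGE_NAME} .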
36 changes: 36 additions & 0 deletions dev/docker/hive2/core-site.xml
@@ -0,0 +1,36 @@
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>

<property>
<name>name</name>
<value>Development Cluster</value>
</property>

<property>
<name>hadoop.http.staticuser.user</name>
<value>hadoopuser</value>
</property>

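<!-- Let the hive and root users impersonate any user from any host; HiveServer2
     relies on this to run queries as the submitting user when doAs is enabled. -->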
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
</configuration>
95 changes: 95 additions & 0 deletions dev/docker/hive2/hadoop-env.sh
@@ -0,0 +1,95 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop

# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in ${HADOOP_HOME}/contrib/capacity-scheduler/*.jar; do
if [ "${HADOOP_CLASSPATH}" ]; then
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$f
else
export HADOOP_CLASSPATH=$f
fi
done

# The maximum amount of heap to use, in MB. Default is 1000.
export HADOOP_HEAPSIZE=8192

# Extra Java runtime options. Empty by default.
export HADOOP_OPTS="${HADOOP_OPTS} -Djava.net.preferIPv4Stack=true -XX:MaxPermSize=512m"

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_NAMENODE_OPTS}"
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS ${HADOOP_DATANODE_OPTS}"

export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_SECONDARYNAMENODE_OPTS}"

export HADOOP_NFS3_OPTS="${HADOOP_NFS3_OPTS}"
export HADOOP_PORTMAP_OPTS="${HADOOP_PORTMAP_OPTS}"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="${HADOOP_CLIENT_OPTS}"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored. ${HADOOP_HOME}/logs by default.

# Where log files are stored in the secure data environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. ${USER} by default.
export HADOOP_IDENT_STRING=${USER}
11 changes: 11 additions & 0 deletions dev/docker/hive2/hdfs-site.xml
@@ -0,0 +1,11 @@
<configuration>
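<!-- Single-node defaults: one block replica, and WebHDFS enabled for HTTP access. -->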
<property>
<name>dfs.replication</name>
<value>1</value>
</property>

<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
</configuration>
