From 005a0d3ef93e49f563fe50ac87b74ec8f780df7b Mon Sep 17 00:00:00 2001 From: Xun Liu Date: Tue, 29 Aug 2023 22:35:53 +0800 Subject: [PATCH] [#241] Hive2 docker runtime environment (#267) ### What changes were proposed in this pull request? Build a hive runtime environment docker image including: These versions match the Hadoop and Hive that Graviton depends on. 1. Hadoop-2.7.3 2. Hive-2.3.9 3. Mysql 4. openJDK-8u382 5. Support `linux/ARM64` and `linux/amd64` architecture ### Why are the changes needed? The Hive catalog integration tests need to connect to a real Hive runtime environment Fix: #241 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Successfully built and pushed to Docker Hub + https://hub.docker.com/repository/docker/datastrato/hive2/tags?page=1&ordering=last_updated image --- .gitignore | 4 +- build.gradle.kts | 5 +- dev/docker/hive2/Dockerfile | 160 +++++++++++++++++++++++++++++++ dev/docker/hive2/README.md | 18 ++++ dev/docker/hive2/build-docker.sh | 45 +++++++++ dev/docker/hive2/core-site.xml | 36 +++++++ dev/docker/hive2/hadoop-env.sh | 95 ++++++++++++++++++ dev/docker/hive2/hdfs-site.xml | 11 +++ dev/docker/hive2/hive-site.xml | 45 +++++++++ dev/docker/hive2/mapred-site.xml | 31 ++++++ dev/docker/hive2/start.sh | 44 +++++++++ dev/docker/hive2/yarn-site.xml | 16 ++++ 12 files changed, 507 insertions(+), 3 deletions(-) create mode 100644 dev/docker/hive2/Dockerfile create mode 100644 dev/docker/hive2/README.md create mode 100755 dev/docker/hive2/build-docker.sh create mode 100644 dev/docker/hive2/core-site.xml create mode 100644 dev/docker/hive2/hadoop-env.sh create mode 100644 dev/docker/hive2/hdfs-site.xml create mode 100644 dev/docker/hive2/hive-site.xml create mode 100644 dev/docker/hive2/mapred-site.xml create mode 100644 dev/docker/hive2/start.sh create mode 100644 dev/docker/hive2/yarn-site.xml diff --git a/.gitignore b/.gitignore index 4bc81cd2e7f..d4f0649c5a0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 
+34,6 @@ out/** *.iws distribution -server/src/main/resources/project.properties \ No newline at end of file +server/src/main/resources/project.properties + +dev/docker/hive2/packages \ No newline at end of file diff --git a/build.gradle.kts b/build.gradle.kts index e4ad0d23da3..12a3f2ef8e0 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -114,8 +114,9 @@ tasks.rat { inputDir.set(project.rootDir) val exclusions = mutableListOf( - // Ignore files we track but do not distribute - "**/.github/**/*", + // Ignore files we track but do not distribute + "**/.github/**/*", + "dev/docker/**/*.xml", ) // Add .gitignore excludes to the Apache Rat exclusion list. diff --git a/dev/docker/hive2/Dockerfile b/dev/docker/hive2/Dockerfile new file mode 100644 index 00000000000..945da2737f2 --- /dev/null +++ b/dev/docker/hive2/Dockerfile @@ -0,0 +1,160 @@ +# +# Copyright 2023 Datastrato. +# This software is licensed under the Apache License version 2. +# + +FROM ubuntu:16.04 +LABEL maintainer="dev@datastrato.com" + +ARG HADOOP_PACKAGE_NAME +ARG HIVE_PACKAGE_NAME + +WORKDIR / + +################################################################################ +# update and install basic tools +RUN apt-get update && apt-get upgrade -y && apt-get install --fix-missing -yq \ + git \ + libkrb5-dev \ + libmysqlclient-dev \ + libssl-dev \ + libsasl2-dev \ + libsasl2-modules-gssapi-mit \ + libsqlite3-dev \ + libtidy-0.99-0 \ + libxml2-dev \ + libxslt-dev \ + libffi-dev \ + libldap2-dev \ + python-dev \ + python-setuptools \ + libgmp3-dev \ + libz-dev \ + curl \ + software-properties-common \ + vim \ + openssh-server \ + wget \ + sudo \ + openjdk-8-jdk + +################################################################################# +## setup ssh +RUN mkdir /root/.ssh +RUN cat /dev/zero | ssh-keygen -q -N "" > /dev/null && cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys + +COPY packages /tmp/packages + 
+################################################################################ +# set environment variables +ENV JAVA_HOME=/usr/local/jdk +ENV HIVE_HOME=/usr/local/hive +ENV HADOOP_HOME=/usr/local/hadoop +ENV HADOOP_HEAPSIZE=8192 +ENV HADOOP_INSTALL=${HADOOP_HOME} +ENV HADOOP_MAPRED_HOME=${HADOOP_INSTALL} +ENV HADOOP_COMMON_HOME=${HADOOP_INSTALL} +ENV HADOOP_HDFS_HOME=${HADOOP_INSTALL} +ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop +ENV YARN_HOME=${HADOOP_INSTALL} + +ENV PATH=${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_INSTALL}/sbin:${HIVE_HOME}/bin:${PATH} +ENV CLASSPATH=${HADOOP_HOME}/lib/*:HIVE_HOME/lib/*:. +ENV LD_LIBRARY_PATH=${HADOOP_HOME}/lib/native + +################################################################################ +# add the above env for all users +RUN ARCH=$(uname -m) && \ + if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \ + ln -s /usr/lib/jvm/java-8-openjdk-arm64 ${JAVA_HOME}; \ + else \ + ln -s /usr/lib/jvm/java-8-openjdk-amd64 ${JAVA_HOME}; \ + fi + +RUN echo "JAVA_HOME=${JAVA_HOME}" >> /etc/environment +RUN echo "HADOOP_HEAPSIZE=${HADOOP_HEAPSIZE}" >> /etc/environment +RUN echo "HADOOP_HOME=${HADOOP_HOME}" >> /etc/environment +RUN echo "HADOOP_INSTALL=${HADOOP_INSTALL}" >> /etc/environment +RUN echo "HADOOP_MAPRED_HOME=${HADOOP_MAPRED_HOME}" >> /etc/environment +RUN echo "HADOOP_COMMON_HOME=${HADOOP_COMMON_HOME}" >> /etc/environment +RUN echo "HADOOP_HDFS_HOME=${HADOOP_HDFS_HOME}" >> /etc/environment +RUN echo "HADOOP_CONF_DIR=${HADOOP_CONF_DIR}" >> /etc/environment +RUN echo "HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar" >> /etc/environment +RUN echo "YARN_HOME=${YARN_HOME}" >> /etc/environment +RUN echo "HIVE_HOME=${HIVE_HOME}" >> /etc/environment +RUN echo "PATH=${PATH}" >> /etc/environment +RUN echo "CLASSPATH=${CLASSPATH}" >> /etc/environment +RUN echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> /etc/environment + +################################################################################ +# install hadoop +RUN 
mkdir ${HADOOP_HOME} +RUN tar -xz -C ${HADOOP_HOME} --strip-components 1 -f /tmp/packages/${HADOOP_PACKAGE_NAME} + +# replace configuration templates +RUN rm -f ${HADOOP_CONF_DIR}/core-site.xml +RUN rm -f ${HADOOP_CONF_DIR}/hadoop-env.sh +RUN rm -f ${HADOOP_CONF_DIR}/hdfs-site.xml +RUN rm -f ${HADOOP_CONF_DIR}/mapred-site.xml +RUN rm -f ${HADOOP_CONF_DIR}/yarn-site.xml + +ADD core-site.xml ${HADOOP_CONF_DIR}/core-site.xml +ADD hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh +ADD hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml +ADD mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml +ADD yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml + +# format HFS +RUN ${HADOOP_HOME}/bin/hdfs namenode -format -nonInteractive + +################################################################################ +# install hive +RUN mkdir ${HIVE_HOME} +RUN tar -xz -C ${HIVE_HOME} --strip-components 1 -f /tmp/packages/${HIVE_PACKAGE_NAME} +ADD hive-site.xml ${HIVE_HOME}/conf/hive-site.xml + +################################################################################ +# install MySQL +ENV MYSQL_PWD=ds123 +RUN echo "mysql-server mysql-server/root_password password ${MYSQL_PWD}" | debconf-set-selections +RUN echo "mysql-server mysql-server/root_password_again password ${MYSQL_PWD}" | debconf-set-selections +RUN apt-get install -y mysql-server + +RUN chown -R mysql:mysql /var/lib/mysql +RUN usermod -d /var/lib/mysql/ mysql + +################################################################################ +# add mysql jdbc driver +RUN wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.15.tar.gz +RUN tar -xzf mysql-connector-java-8.0.15.tar.gz +RUN cp mysql-connector-java-8.0.15/mysql-connector-java-8.0.15.jar ${HIVE_HOME}/lib +RUN rm -rf mysql-connector-java-8.0.15 mysql-connector-java-8.0.15.tar.gz + +################################################################################ +# add users and groups +RUN groupadd hdfs && groupadd hadoop && groupadd hive 
&& groupadd mapred + +RUN useradd -g hadoop datastrato && echo "datastrato:ds123" | chpasswd && adduser datastrato sudo +RUN usermod -s /bin/bash datastrato + +RUN usermod -a -G hdfs datastrato +RUN usermod -a -G hadoop datastrato +RUN usermod -a -G hive datastrato +RUN usermod -a -G mapred datastrato + +RUN mkdir /home/datastrato +RUN chown -R datastrato:hadoop /home/datastrato + +################################################################################ +# removed install packages +RUN rm -rf /tmp/packages + +################################################################################ +# expose port +EXPOSE 8088 50070 50075 10002 10000 8888 9083 7180 22 + +################################################################################ +# create startup script and set ENTRYPOINT +WORKDIR / +ADD start.sh /usr/local/sbin +ENTRYPOINT ["/bin/bash", "/usr/local/sbin/start.sh"] \ No newline at end of file diff --git a/dev/docker/hive2/README.md b/dev/docker/hive2/README.md new file mode 100644 index 00000000000..711c6e01e2d --- /dev/null +++ b/dev/docker/hive2/README.md @@ -0,0 +1,18 @@ + +# hadoop2 +Build docker image that includes Hadoop2, Hive2 + +Build Image +=========== +./build-docker.sh + +Run container +============= +docker run --rm -m -p 8088:8088 -p 50070:50070 -p 50075:50075 -p 10000:10000 -p 10002:10002 -p 8888:8888 -p 9083:9083 -p 8022:22 datastrato/hive2:0.1.0 + +Login to the server +============= +ssh -p 8022 datastrato@localhost (password: ds123, this is a sudo user) diff --git a/dev/docker/hive2/build-docker.sh b/dev/docker/hive2/build-docker.sh new file mode 100755 index 00000000000..9f1b2ea3356 --- /dev/null +++ b/dev/docker/hive2/build-docker.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# +# Copyright 2023 Datastrato. +# This software is licensed under the Apache License version 2. 
+# +set -ex +bin="$(dirname "${BASH_SOURCE-$0}")" +bin="$(cd "${bin}">/dev/null; pwd)" + +# Environment variables definition +IMAGE_NAME="datastrato/hive2:0.1.0" +HADOOP_VERSION="2.7.3" +HIVE_VERSION="2.3.9" + +HADOOP_PACKAGE_NAME="hadoop-${HADOOP_VERSION}.tar.gz" +HADOOP_DOWNLOAD_URL="http://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/${HADOOP_PACKAGE_NAME}" + +HIVE_PACKAGE_NAME="apache-hive-${HIVE_VERSION}-bin.tar.gz" +HIVE_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/${HIVE_PACKAGE_NAME}" + +# Prepare download packages +if [[ ! -d "${bin}/packages" ]]; then + mkdir -p "${bin}/packages" +fi + +if [ ! -f "${bin}/packages/${HADOOP_PACKAGE_NAME}" ]; then + curl -s -o "${bin}/packages/${HADOOP_PACKAGE_NAME}" ${HADOOP_DOWNLOAD_URL} +fi + +if [ ! -f "${bin}/packages/${HIVE_PACKAGE_NAME}" ]; then + curl -s -o "${bin}/packages/${HIVE_PACKAGE_NAME}" ${HIVE_DOWNLOAD_URL} +fi + +# Create multi-arch builder +BUILDER_NAME="hive2" +builders=$(docker buildx ls) +if echo "${builders}" | grep -q "${BUILDER_NAME}"; then + echo "BuildKit builder '${BUILDER_NAME}' already exists." +else + echo "BuildKit builder '${BUILDER_NAME}' does not exist." + docker buildx create --platform linux/amd64,linux/arm64 --use --name hive2 +fi + +# Option params --no-cache --push +docker buildx build --platform=linux/amd64,linux/arm64 --build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --output type=docker --progress plain -t ${IMAGE_NAME} . 
diff --git a/dev/docker/hive2/core-site.xml b/dev/docker/hive2/core-site.xml new file mode 100644 index 00000000000..562975d7de1 --- /dev/null +++ b/dev/docker/hive2/core-site.xml @@ -0,0 +1,36 @@ + + + fs.defaultFS + hdfs://localhost:9000 + + + + name + Development Cluster + + + + hadoop.http.staticuser.user + hadoopuser + + + + hadoop.proxyuser.hive.hosts + * + + + + hadoop.proxyuser.hive.groups + * + + + + hadoop.proxyuser.root.groups + * + + + + hadoop.proxyuser.root.hosts + * + + diff --git a/dev/docker/hive2/hadoop-env.sh b/dev/docker/hive2/hadoop-env.sh new file mode 100644 index 00000000000..856be5546d8 --- /dev/null +++ b/dev/docker/hive2/hadoop-env.sh @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Set Hadoop-specific environment variables here. + +# The only required environment variable is JAVA_HOME. All others are +# optional. When running a distributed configuration it is best to +# set JAVA_HOME in this file, so that it is correctly defined on +# remote nodes. + +# The java implementation to use. +export JAVA_HOME=${JAVA_HOME} + +# The jsvc implementation to use. Jsvc is required to run secure datanodes +# that bind to privileged ports to provide authentication of data transfer +# protocol. 
Jsvc is not required if SASL is configured for authentication of +# data transfer protocol using non-privileged ports. +#export JSVC_HOME=${JSVC_HOME} + +export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop + +# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. +for f in ${HADOOP_HOME}/contrib/capacity-scheduler/*.jar; do + if [ "${HADOOP_CLASSPATH}" ]; then + export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$f + else + export HADOOP_CLASSPATH=$f + fi +done + +# The maximum amount of heap to use, in MB. Default is 1000. +export HADOOP_HEAPSIZE=8192 + +# Extra Java runtime options. Empty by default. +export HADOOP_OPTS="${HADOOP_OPTS} -Djava.net.preferIPv4Stack=true -XX:MaxPermSize=512m" + +# Command specific options appended to HADOOP_OPTS when specified +export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_NAMENODE_OPTS}" +export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS ${HADOOP_DATANODE_OPTS}" + +export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_SECONDARYNAMENODE_OPTS}" + +export HADOOP_NFS3_OPTS="${HADOOP_NFS3_OPTS}" +export HADOOP_PORTMAP_OPTS="${HADOOP_PORTMAP_OPTS}" + +# The following applies to multiple commands (fs, dfs, fsck, distcp etc) +export HADOOP_CLIENT_OPTS="${HADOOP_CLIENT_OPTS}" + +# On secure datanodes, user to run the datanode as after dropping privileges. +# This **MUST** be uncommented to enable secure HDFS if using privileged ports +# to provide authentication of data transfer protocol. This **MUST NOT** be +# defined if SASL is configured for authentication of data transfer protocol +# using non-privileged ports. +export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} + +# Where log files are stored. ${HADOOP_HOME}/logs by default. 
+ +# Where log files are stored in the secure data environment. +export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} + +### +# HDFS Mover specific parameters +### +# Specify the JVM options to be used when starting the HDFS Mover. +# These options will be appended to the options specified as HADOOP_OPTS +# and therefore may override any similar flags set in HADOOP_OPTS +# +# export HADOOP_MOVER_OPTS="" + +### +# Advanced Users Only! +### + +# The directory where pid files are stored. /tmp by default. +# NOTE: this should be set to a directory that can only be written to by +# the user that will run the hadoop daemons. Otherwise there is the +# potential for a symlink attack. +export HADOOP_PID_DIR=${HADOOP_PID_DIR} +export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} + +# A string representing this instance of hadoop. ${USER} by default. +export HADOOP_IDENT_STRING=${USER} diff --git a/dev/docker/hive2/hdfs-site.xml b/dev/docker/hive2/hdfs-site.xml new file mode 100644 index 00000000000..34c603fd0e6 --- /dev/null +++ b/dev/docker/hive2/hdfs-site.xml @@ -0,0 +1,11 @@ + + + dfs.replication + 1 + + + + dfs.webhdfs.enabled + true + + diff --git a/dev/docker/hive2/hive-site.xml b/dev/docker/hive2/hive-site.xml new file mode 100644 index 00000000000..cf47751c897 --- /dev/null +++ b/dev/docker/hive2/hive-site.xml @@ -0,0 +1,45 @@ + + + hive.server2.enable.doAs + false + Disable user impersonation for HiveServer2 + + + + hive.exec.scratchdir + /tmp + Scratch space for Hive jobs + + + + mapred.child.java.opts + -Xmx4G -XX:+UseConcMarkSweepGC + Max memory for Map Reduce Jobs + + + + javax.jdo.option.ConnectionURL + jdbc:mysql://localhost/metastore_db?createDatabaseIfNotExist=true&useSSL=false + + + + javax.jdo.option.ConnectionUserName + hive + + + + javax.jdo.option.ConnectionPassword + hive + + + + javax.jdo.option.ConnectionDriverName + com.mysql.jdbc.Driver + + + + hive.metastore.warehouse.dir + hdfs://localhost:9000/user/hive/warehouse + location of 
default database for the warehouse + + diff --git a/dev/docker/hive2/mapred-site.xml b/dev/docker/hive2/mapred-site.xml new file mode 100644 index 00000000000..c30b5b514ad --- /dev/null +++ b/dev/docker/hive2/mapred-site.xml @@ -0,0 +1,31 @@ + + + mapreduce.framework.name + yarn + + + + mapreduce.map.memory.mb + 3072 + + + + mapreduce.reduce.memory.mb + 3072 + + + + mapreduce.map.java.opts + -Xmx3G + + + + mapreduce.reduce.java.opts + -Xmx3G + + + + mapred.child.java.opts + -Xmx3G -XX:+UseConcMarkSweepGC + + diff --git a/dev/docker/hive2/start.sh b/dev/docker/hive2/start.sh new file mode 100644 index 00000000000..2bd182117ad --- /dev/null +++ b/dev/docker/hive2/start.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright 2023 Datastrato. +# This software is licensed under the Apache License version 2. +# + +# start ssh +service ssh start +ssh-keyscan localhost > /root/.ssh/known_hosts +ssh-keyscan 0.0.0.0 >> /root/.ssh/known_hosts + +# start hadoop +${HADOOP_HOME}/sbin/start-all.sh + +${HADOOP_HOME}/bin/hdfs dfs -mkdir /tmp +${HADOOP_HOME}/bin/hdfs dfs -chmod 1777 /tmp +${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/hive/warehouse +${HADOOP_HOME}/bin/hdfs dfs -chown -R hive:hive /user/hive +${HADOOP_HOME}/bin/hdfs dfs -chmod -R 775 /user/hive +${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/datastrato +${HADOOP_HOME}/bin/hdfs dfs -chown -R datastrato:hdfs /user/datastrato +${HADOOP_HOME}/bin/hdfs dfs -chmod 755 /user/datastrato +${HADOOP_HOME}/bin/hdfs dfs -chmod -R 777 /user/hive/tmp + +# start mysql and create databases/users for hive +chown -R mysql:mysql /var/lib/mysql +usermod -d /var/lib/mysql/ mysql +service mysql start + +echo """ + CREATE USER 'hive'@'localhost' IDENTIFIED BY 'hive'; + GRANT ALL PRIVILEGES on *.* to 'hive'@'localhost' WITH GRANT OPTION; + GRANT ALL on hive.* to 'hive'@'localhost' IDENTIFIED BY 'hive'; + FLUSH PRIVILEGES; + CREATE DATABASE hive; +""" | mysql --user=root --password=${MYSQL_PWD} + +# start hive +${HIVE_HOME}/bin/schematool -initSchema 
-dbType mysql +${HIVE_HOME}/bin/hive --service hiveserver2 > /dev/null 2>&1 & +${HIVE_HOME}/bin/hive --service metastore > /dev/null 2>&1 & + +# persist the container +tail -f /dev/null diff --git a/dev/docker/hive2/yarn-site.xml b/dev/docker/hive2/yarn-site.xml new file mode 100644 index 00000000000..452d58e3139 --- /dev/null +++ b/dev/docker/hive2/yarn-site.xml @@ -0,0 +1,16 @@ + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.nodemanager.pmem-check-enabled + false + + + + yarn.nodemanager.vmem-check-enabled + false + +