From 2a2a190819315a6d67c8d5c83cbef216cbaff88c Mon Sep 17 00:00:00 2001
From: Qi Yu
Date: Mon, 22 Jul 2024 15:08:50 +0800
Subject: [PATCH] [#4111] Limit port ranges in docker to reduce possible port
 conflicts for Hive docker. (#4106)

### What changes were proposed in this pull request?

1. Increase the timeout for checking whether Hive is writable from 30 seconds to 150 seconds.
2. Add more log information.
3. Fix potential port-occupation issues for the DataNode and NameNode.

### Why are the changes needed?

To improve CI stability.

Fix: #4111

### Does this PR introduce _any_ user-facing change?

N/A.

### How was this patch tested?

Existing tests.
---
 catalogs/catalog-hadoop/build.gradle.kts      |  2 +-
 catalogs/catalog-hive/build.gradle.kts        |  2 +-
 .../build.gradle.kts                          |  2 +-
 dev/docker/kerberos-hive/Dockerfile           |  3 +-
 dev/docker/kerberos-hive/start.sh             | 32 +++++++++++++++----
 .../test/container/BaseContainer.java         | 10 +++++-
 .../test/container/HiveContainer.java         |  6 ++++
 7 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/catalogs/catalog-hadoop/build.gradle.kts b/catalogs/catalog-hadoop/build.gradle.kts
index 4547da9200a..e7659ef7b5a 100644
--- a/catalogs/catalog-hadoop/build.gradle.kts
+++ b/catalogs/catalog-hadoop/build.gradle.kts
@@ -126,7 +126,7 @@ tasks.test {
     doFirst {
       environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-hive:0.1.13")
-      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.2")
+      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.4")
     }
 
     val init = project.extra.get("initIntegrationTest") as (Test) -> Unit
diff --git a/catalogs/catalog-hive/build.gradle.kts b/catalogs/catalog-hive/build.gradle.kts
index 9930be54257..1a8a9015a05 100644
--- a/catalogs/catalog-hive/build.gradle.kts
+++ b/catalogs/catalog-hive/build.gradle.kts
@@ -179,7 +179,7 @@ tasks.test {
     doFirst {
       environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-hive:0.1.13")
-      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.2")
+      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.4")
     }
 
     val init = project.extra.get("initIntegrationTest") as (Test) -> Unit
diff --git a/catalogs/catalog-lakehouse-iceberg/build.gradle.kts b/catalogs/catalog-lakehouse-iceberg/build.gradle.kts
index 4f2a606c8a0..1d0eb242e35 100644
--- a/catalogs/catalog-lakehouse-iceberg/build.gradle.kts
+++ b/catalogs/catalog-lakehouse-iceberg/build.gradle.kts
@@ -180,7 +180,7 @@ tasks.test {
     doFirst {
       environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-hive:0.1.13")
-      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.2")
+      environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "datastrato/gravitino-ci-kerberos-hive:0.1.4")
     }
 
     val init = project.extra.get("initIntegrationTest") as (Test) -> Unit
diff --git a/dev/docker/kerberos-hive/Dockerfile b/dev/docker/kerberos-hive/Dockerfile
index 92cb3491131..ae44ad6b7e5 100644
--- a/dev/docker/kerberos-hive/Dockerfile
+++ b/dev/docker/kerberos-hive/Dockerfile
@@ -58,7 +58,8 @@ RUN apt-get update && apt-get upgrade -y && apt-get install --fix-missing -yq \
       krb5-admin-server \
       krb5-user \
       krb5-config \
-      jsvc
+      jsvc \
+      net-tools
 
 #################################################################################
 ## setup ssh
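Note: the port-range limiting named in the title is implemented in BaseContainer.java further below via the `net.ipv4.ip_local_port_range` sysctl, which restricts the ephemeral (locally auto-assigned) ports inside the container to 20000-40000 and, per the PR title, reduces the chance of conflicts with the ports the Hive/HDFS daemons use. The following is a minimal standalone sketch of that technique using the same Testcontainers/docker-java calls as the actual change; the class name and the alpine image are placeholders, not part of this patch.

```java
import java.util.Collections;
import org.testcontainers.containers.GenericContainer;

public class PortRangeSketch {
  public static void main(String[] args) throws Exception {
    // Narrow the ephemeral port range inside the container via a Docker sysctl,
    // mirroring the BaseContainer change in this patch. "alpine:3.19" is only a
    // placeholder image for the sketch.
    try (GenericContainer<?> container =
        new GenericContainer<>("alpine:3.19")
            .withCreateContainerCmdModifier(
                cmd ->
                    cmd.getHostConfig()
                        .withSysctls(
                            Collections.singletonMap(
                                "net.ipv4.ip_local_port_range", "20000 40000")))
            .withCommand("sleep", "60")) {
      container.start();
      // Should print the configured range if the sysctl was applied.
      System.out.println(
          container
              .execInContainer("cat", "/proc/sys/net/ipv4/ip_local_port_range")
              .getStdout());
    }
  }
}
```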
diff --git a/dev/docker/kerberos-hive/start.sh b/dev/docker/kerberos-hive/start.sh
index 4254fcd4375..b457a3c5301 100644
--- a/dev/docker/kerberos-hive/start.sh
+++ b/dev/docker/kerberos-hive/start.sh
@@ -79,22 +79,36 @@ echo "Starting HDFS..."
 echo "Starting NameNode..."
 ${HADOOP_HOME}/sbin/hadoop-daemon.sh start namenode
+# Check if the nameNode is running
+ps -ef | grep NameNode | grep -v "grep"
+if [[ $? -ne 0 ]]; then
+  echo "NameNode failed to start, please check the logs"
+  echo "HDFS NameNode log start---------------------------"
+  cat ${HADOOP_HOME}/logs/*.log
+  cat ${HADOOP_HOME}/logs/*.out
+  echo "HDFS NameNode log end-----------------------------"
+  exit 1
+fi
+
+
 echo "Starting DataNode..."
 ${HADOOP_HOME}/sbin/start-secure-dns.sh
 sleep 5
 
 # Check if the DataNode is running
-ps -ef | grep DataNode | grep -v "color=auto"
+ps -ef | grep DataNode | grep -v "grep"
 if [[ $? -ne 0 ]]; then
   echo "DataNode failed to start, please check the logs"
-  ehco "HDFS DataNode log start----------------------------"
-  cat ${HADOOP_HOME}/bin/logs/hadoop-root-datanode-*.log
+  echo "HDFS DataNode log start---------------------------"
+  cat ${HADOOP_HOME}/logs/*.log
+  cat ${HADOOP_HOME}/logs/*.out
+  echo "HDFS DataNode log end-----------------------------"
   exit 1
 fi
 
 retry_times=0
 ready=0
-while [[ ${retry_times} -lt 10 ]]; do
+while [[ ${retry_times} -lt 15 ]]; do
   hdfs_ready=$(hdfs dfsadmin -report | grep "Live datanodes" | awk '{print $3}')
   if [[ ${hdfs_ready} == "(1):" ]]; then
     echo "HDFS is ready, retry_times = ${retry_times}"
@@ -106,9 +120,13 @@ while [[ ${retry_times} -lt 10 ]]; do
 done
 
 if [[ ${ready} -ne 1 ]]; then
-  echo "HDFS is not ready"
-  ehco "HDFS DataNode log start---------------------------"
-  cat ${HADOOP_HOME}/bin/logs/hadoop-root-datanode-*.log
+  echo "HDFS is not ready, execute log:"
+  ps -ef | grep DataNode | grep -v "grep"
+  hdfs dfsadmin -report
+  echo "HDFS DataNode log start---------------------------"
+  cat ${HADOOP_HOME}/logs/*.log
+  cat ${HADOOP_HOME}/logs/*.out
+  echo "HDFS DataNode log end-----------------------------"
   exit 1
 fi
diff --git a/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/BaseContainer.java b/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/BaseContainer.java
index 93b0ea157ea..192ba2c0cb5 100644
--- a/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/BaseContainer.java
+++ b/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/BaseContainer.java
@@ -29,6 +29,7 @@ import com.google.common.collect.ImmutableSet;
 import java.io.IOException;
 import java.time.Duration;
+import java.util.Collections;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -72,7 +73,14 @@ protected BaseContainer(
       Map<String, String> filesToMount,
       Map<String, String> envVars,
       Optional<Network> network) {
-    this.container = new GenericContainer<>(requireNonNull(image, "image is null"));
+    this.container =
+        new GenericContainer<>(requireNonNull(image, "image is null"))
+            .withCreateContainerCmdModifier(
+                cmd ->
+                    cmd.getHostConfig()
+                        .withSysctls(
+                            Collections.singletonMap(
+                                "net.ipv4.ip_local_port_range", "20000 40000")));
     this.ports = requireNonNull(ports, "ports is null");
     this.hostName = requireNonNull(hostName, "hostName is null");
     this.extraHosts = extraHosts;
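Note: the start.sh changes above all follow one pattern: poll a readiness check a bounded number of times (the retry limit is raised from 10 to 15) and dump the daemon logs if it never succeeds. Below is a small, generic sketch of that pattern in Java for the test-container side; the helper class, method, and parameter names are hypothetical, not code from this patch.

```java
import java.util.function.Supplier;

public final class ReadinessRetrySketch {
  private ReadinessRetrySketch() {}

  // Polls `check` up to `maxRetries` times, sleeping between attempts. If the check
  // never passes, runs `dumpDiagnostics` (e.g. print daemon logs) and returns false,
  // mirroring the failure handling in the updated start.sh.
  public static boolean waitUntilReady(
      Supplier<Boolean> check, int maxRetries, long sleepMillis, Runnable dumpDiagnostics)
      throws InterruptedException {
    for (int attempt = 0; attempt < maxRetries; attempt++) {
      if (Boolean.TRUE.equals(check.get())) {
        return true;
      }
      Thread.sleep(sleepMillis);
    }
    dumpDiagnostics.run();
    return false;
  }
}
```

A caller might use it as `waitUntilReady(() -> hdfsReportsOneLiveDatanode(), 15, 10_000L, () -> dumpDatanodeLogs())`, where both lambdas are likewise hypothetical stand-ins for project-specific checks.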
diff --git a/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/HiveContainer.java b/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/HiveContainer.java
index 6fbcfb16e07..c08094e7bfb 100644
--- a/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/HiveContainer.java
+++ b/integration-test-common/src/test/java/org/apache/gravitino/integration/test/container/HiveContainer.java
@@ -180,6 +180,12 @@ protected boolean checkContainerStatus(int retryLimit) {
       if (result.getExitCode() == 0) {
         return true;
       }
+
+      LOG.warn(
+          "Failed to execute sql: {}, Std-out: {}, Std-error:{}",
+          createTableSQL,
+          result.getStdout(),
+          result.getStderr());
     } catch (Exception e) {
       LOG.error("Failed to execute sql: {}", createTableSQL, e);
     }
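Note: the HiveContainer change keeps the probe's stdout and stderr when the status check fails, so CI logs explain why the container was judged unhealthy. A sketch of the same exec-and-log pattern using the Testcontainers exec API follows; the class, method, command, and log messages are illustrative only and not part of this patch.

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.Container;
import org.testcontainers.containers.GenericContainer;

public final class ExecProbeSketch {
  private static final Logger LOG = LoggerFactory.getLogger(ExecProbeSketch.class);

  // Runs a probe command inside a running container and returns true on exit code 0.
  // On failure it logs the full stdout/stderr, like the LOG.warn added to HiveContainer.
  static boolean probe(GenericContainer<?> container, String... command) {
    try {
      Container.ExecResult result = container.execInContainer(command);
      if (result.getExitCode() == 0) {
        return true;
      }
      LOG.warn(
          "Command '{}' failed with exit code {}, stdout: {}, stderr: {}",
          String.join(" ", command),
          result.getExitCode(),
          result.getStdout(),
          result.getStderr());
    } catch (Exception e) {
      LOG.error("Command '{}' threw an exception", String.join(" ", command), e);
    }
    return false;
  }
}
```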