From df2fc170cdea780a5ab49cda366ea1d8564affcc Mon Sep 17 00:00:00 2001 From: TungYuChiang Date: Mon, 4 Nov 2024 22:51:19 +0800 Subject: [PATCH] Add Flink-Hive use case in Jupyter notebook (#91) This commit adds a Flink-Hive use case to the Jupyter notebook --- docker-compose.yaml | 4 + .../gravitino-flink-hive-example.ipynb | 104 ++++++++---------- init/jupyter/jupyter-dependency.sh | 19 +++- 3 files changed, 63 insertions(+), 64 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 59e9824b..7001b0fd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -147,6 +147,10 @@ services: entrypoint: /bin/bash /tmp/gravitino/init.sh environment: - HADOOP_CLASSPATH=/tmp/gravitino/packages/hadoop-2.7.3/etc/hadoop:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/common/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/common/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/yarn/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/yarn/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/mapreduce/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/mapreduce/*:/tmp/gravitino/packages/contrib/capacity-scheduler/*.jar + - NB_USER=my-username + - GRANT_SUDO=yes + - CHOWN_HOME=yes + user: root depends_on: hive : condition: service_healthy diff --git a/init/jupyter/gravitino-flink-hive-example.ipynb b/init/jupyter/gravitino-flink-hive-example.ipynb index ccbd0fdf..a5cc4571 100644 --- a/init/jupyter/gravitino-flink-hive-example.ipynb +++ b/init/jupyter/gravitino-flink-hive-example.ipynb @@ -18,7 +18,40 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "b13b0b0b-6aca-4cbb-8771-a10f4c79a017", + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get update && sudo apt-get install -y openjdk-17-jdk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ef94f47-5718-4c35-82ce-90bd2c00927a", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-arm64\"\n", + "os.environ[\"PATH\"] = f\"{os.environ['JAVA_HOME']}/bin:\" + os.environ[\"PATH\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3c975dc-afa1-4057-9990-6d4b8c06749b", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 -m pip install apache-flink" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "f0cf8f3e-14f9-4209-8103-a3a0c598a21a", "metadata": {}, "outputs": [], @@ -54,21 +87,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "f1037708-56a3-4b7a-80a1-b1015e928a03", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table_env.use_catalog(\"catalog_hive\")\n", "table_env.execute_sql(\"CREATE DATABASE IF NOT EXISTS Reading_System\")\n", @@ -85,20 +107,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "0996b4d2-35dc-456c-9b08-52b8beb8fe86", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "result = table_env.execute_sql(\"SHOW DATABASES\")\n", "with result.collect() as results:\n", @@ -116,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "69eea5be-73c9-489a-b294-74bdea0f6bf7", "metadata": {}, "outputs": [], @@ -148,21 +160,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "228d0ca3-8ad2-4b53-ae99-c430713aeb02", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table_env.execute_sql(\"\"\"\n", " INSERT INTO books VALUES \n", @@ -181,29 +182,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5232e493-a699-4d50-b489-4de4652bf344", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "result = table_env.execute_sql(\"SELECT * FROM books\")\n", "with result.collect() as results:\n", diff --git a/init/jupyter/jupyter-dependency.sh b/init/jupyter/jupyter-dependency.sh index 235ba3d6..2c5f5e0a 100755 --- a/init/jupyter/jupyter-dependency.sh +++ b/init/jupyter/jupyter-dependency.sh @@ -34,12 +34,25 @@ fi FLINK_HIVE_CONNECTOR_JAR="https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-2.3.10_2.12/1.20.0/flink-sql-connector-hive-2.3.10_2.12-1.20.0.jar" FLINK_HIVE_CONNECTOR_MD5="${FLINK_HIVE_CONNECTOR_JAR}.md5" -download_and_verify "${FLINK_HIVE_CONNECTOR_JAR}" "${FLINK_HIVE_CONNECTOR_MD5}" "${script_dir}/packages" +download_and_verify "${FLINK_HIVE_CONNECTOR_JAR}" "${FLINK_HIVE_CONNECTOR_MD5}" "${jupyter_dir}" GRAVITINO_FLINK_JAR="https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-flink-1.18_2.12/0.6.1-incubating/gravitino-flink-1.18_2.12-0.6.1-incubating.jar" GRAVITINO_FLINK_MD5="${GRAVITINO_FLINK_JAR}.md5" -download_and_verify "${GRAVITINO_FLINK_JAR}" "${GRAVITINO_FLINK_MD5}" "${script_dir}/packages" +download_and_verify "${GRAVITINO_FLINK_JAR}" "${GRAVITINO_FLINK_MD5}" "${jupyter_dir}" GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR="https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-flink-connector-runtime-1.18_2.12/0.6.1-incubating/gravitino-flink-connector-runtime-1.18_2.12-0.6.1-incubating.jar" GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5="${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}.md5" -download_and_verify "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}" "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5}" "${script_dir}/packages" \ No newline at end of file +download_and_verify "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}" "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5}" "${jupyter_dir}" + + +HADOOP_VERSION="2.7.3" +HADOOP_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" +echo "Downloading Hadoop ${HADOOP_VERSION}..." + +curl -fLo "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz" "$HADOOP_URL" || { echo "Failed to download Hadoop ${HADOOP_VERSION}"; exit 1; } +echo "Extracting Hadoop ${HADOOP_VERSION}..." + +tar -xzf "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz" -C "${jupyter_dir}/packages" +rm "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz" + +echo "Hadoop ${HADOOP_VERSION} downloaded and extracted to ${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}"