Skip to content

Commit

Permalink
Add Flink-Hive use case in Jupyter notebook (apache#91)
Browse files Browse the repository at this point in the history
This commit adds a Flink-Hive use case to the Jupyter notebook
  • Loading branch information
TungYuChiang committed Nov 5, 2024
1 parent 440ffd2 commit df2fc17
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 64 deletions.
4 changes: 4 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ services:
entrypoint: /bin/bash /tmp/gravitino/init.sh
environment:
- HADOOP_CLASSPATH=/tmp/gravitino/packages/hadoop-2.7.3/etc/hadoop:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/common/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/common/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/hdfs/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/yarn/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/yarn/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/mapreduce/lib/*:/tmp/gravitino/packages/hadoop-2.7.3/share/hadoop/mapreduce/*:/tmp/gravitino/packages/contrib/capacity-scheduler/*.jar
- NB_USER=my-username
- GRANT_SUDO=yes
- CHOWN_HOME=yes
user: root
depends_on:
hive :
condition: service_healthy
Expand Down
104 changes: 43 additions & 61 deletions init/jupyter/gravitino-flink-hive-example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,40 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "b13b0b0b-6aca-4cbb-8771-a10f4c79a017",
"metadata": {},
"outputs": [],
"source": [
"!sudo apt-get update && sudo apt-get install -y openjdk-17-jdk"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ef94f47-5718-4c35-82ce-90bd2c00927a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-arm64\"\n",
"os.environ[\"PATH\"] = f\"{os.environ['JAVA_HOME']}/bin:\" + os.environ[\"PATH\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3c975dc-afa1-4057-9990-6d4b8c06749b",
"metadata": {},
"outputs": [],
"source": [
"!python3 -m pip install apache-flink"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0cf8f3e-14f9-4209-8103-a3a0c598a21a",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -54,21 +87,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "f1037708-56a3-4b7a-80a1-b1015e928a03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyflink.table.table_result.TableResult at 0xffff67efd290>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"table_env.use_catalog(\"catalog_hive\")\n",
"table_env.execute_sql(\"CREATE DATABASE IF NOT EXISTS Reading_System\")\n",
Expand All @@ -85,20 +107,10 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "0996b4d2-35dc-456c-9b08-52b8beb8fe86",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Row('default')>\n",
"<Row('reading_system')>\n",
"<Row('sales')>\n"
]
}
],
"outputs": [],
"source": [
"result = table_env.execute_sql(\"SHOW DATABASES\")\n",
"with result.collect() as results:\n",
Expand All @@ -116,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "69eea5be-73c9-489a-b294-74bdea0f6bf7",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -148,21 +160,10 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "228d0ca3-8ad2-4b53-ae99-c430713aeb02",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyflink.table.table_result.TableResult at 0xffff67f14a50>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"table_env.execute_sql(\"\"\"\n",
" INSERT INTO books VALUES \n",
Expand All @@ -181,29 +182,10 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "5232e493-a699-4d50-b489-4de4652bf344",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Row(4, 'The Great Gatsby', 'F. Scott Fitzgerald', '1925-04-10')>\n",
"<Row(5, 'Moby Dick', 'Herman Melville', '1851-11-14')>\n",
"<Row(6, 'Pride and Prejudice', 'Jane Austen', '1813-01-28')>\n",
"<Row(7, 'The Catcher in the Rye', 'J.D. Salinger', '1951-07-16')>\n",
"<Row(6, 'Pride and Prejudice', 'Jane Austen', '1813-01-28')>\n",
"<Row(7, 'The Catcher in the Rye', 'J.D. Salinger', '1951-07-16')>\n",
"<Row(6, 'Pride and Prejudice', 'Jane Austen', '1813-01-28')>\n",
"<Row(7, 'The Catcher in the Rye', 'J.D. Salinger', '1951-07-16')>\n",
"<Row(4, 'The Great Gatsby', 'F. Scott Fitzgerald', '1925-04-10')>\n",
"<Row(5, 'Moby Dick', 'Herman Melville', '1851-11-14')>\n",
"<Row(4, 'The Great Gatsby', 'F. Scott Fitzgerald', '1925-04-10')>\n",
"<Row(5, 'Moby Dick', 'Herman Melville', '1851-11-14')>\n"
]
}
],
"outputs": [],
"source": [
"result = table_env.execute_sql(\"SELECT * FROM books\")\n",
"with result.collect() as results:\n",
Expand Down
19 changes: 16 additions & 3 deletions init/jupyter/jupyter-dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,25 @@ fi

FLINK_HIVE_CONNECTOR_JAR="https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-2.3.10_2.12/1.20.0/flink-sql-connector-hive-2.3.10_2.12-1.20.0.jar"
FLINK_HIVE_CONNECTOR_MD5="${FLINK_HIVE_CONNECTOR_JAR}.md5"
download_and_verify "${FLINK_HIVE_CONNECTOR_JAR}" "${FLINK_HIVE_CONNECTOR_MD5}" "${script_dir}/packages"
download_and_verify "${FLINK_HIVE_CONNECTOR_JAR}" "${FLINK_HIVE_CONNECTOR_MD5}" "${jupyter_dir}"

GRAVITINO_FLINK_JAR="https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-flink-1.18_2.12/0.6.1-incubating/gravitino-flink-1.18_2.12-0.6.1-incubating.jar"
GRAVITINO_FLINK_MD5="${GRAVITINO_FLINK_JAR}.md5"
download_and_verify "${GRAVITINO_FLINK_JAR}" "${GRAVITINO_FLINK_MD5}" "${script_dir}/packages"
download_and_verify "${GRAVITINO_FLINK_JAR}" "${GRAVITINO_FLINK_MD5}" "${jupyter_dir}"

GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR="https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-flink-connector-runtime-1.18_2.12/0.6.1-incubating/gravitino-flink-connector-runtime-1.18_2.12-0.6.1-incubating.jar"
GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5="${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}.md5"
download_and_verify "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}" "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5}" "${script_dir}/packages"
download_and_verify "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_JAR}" "${GRAVITINO_FLINK_CONNECTOR_RUNTIME_MD5}" "${jupyter_dir}"


HADOOP_VERSION="2.7.3"
HADOOP_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"
echo "Downloading Hadoop ${HADOOP_VERSION}..."

curl -fLo "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz" "$HADOOP_URL" || { echo "Failed to download Hadoop ${HADOOP_VERSION}"; exit 1; }
echo "Extracting Hadoop ${HADOOP_VERSION}..."

tar -xzf "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz" -C "${jupyter_dir}/packages"
rm "${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}.tar.gz"

echo "Hadoop ${HADOOP_VERSION} downloaded and extracted to ${jupyter_dir}/packages/hadoop-${HADOOP_VERSION}"

0 comments on commit df2fc17

Please sign in to comment.