diff --git a/installer/charts/jupyterlab/start-notebook.sh b/installer/charts/jupyterlab/start-notebook.sh
index 0bfc054..2733e69 100644
--- a/installer/charts/jupyterlab/start-notebook.sh
+++ b/installer/charts/jupyterlab/start-notebook.sh
@@ -12,6 +12,6 @@
 jupyter kernelspec uninstall sparkrkernel -y
 jupyter kernelspec uninstall python3 -y
 mkdir -p /home/jovyan/.sparkmagic/
 cp /tmp/sparkmagic.json /home/jovyan/.sparkmagic/config.json
-cp /tmp/quickstart.ipynb /home/jovyan/work/quickstart.ipynb
+cp /tmp/*.ipynb /home/jovyan/work/
 exec /usr/local/bin/start-notebook.py "$@"
\ No newline at end of file
diff --git a/installer/charts/jupyterlab/templates/deployment.yaml b/installer/charts/jupyterlab/templates/deployment.yaml
index d389b08..f05d149 100644
--- a/installer/charts/jupyterlab/templates/deployment.yaml
+++ b/installer/charts/jupyterlab/templates/deployment.yaml
@@ -53,6 +53,9 @@ spec:
         - name: secret
           mountPath: /tmp/quickstart.ipynb
           subPath: quickstart
+        - name: secret
+          mountPath: /tmp/tpcds.ipynb
+          subPath: tpcds
         - name: secret
           mountPath: /home/jovyan/.jupyter/jupyter_notebook_config.py
           subPath: jupyter_notebook_config
diff --git a/installer/charts/jupyterlab/templates/secret.yaml b/installer/charts/jupyterlab/templates/secret.yaml
index bb665e1..6400cf1 100644
--- a/installer/charts/jupyterlab/templates/secret.yaml
+++ b/installer/charts/jupyterlab/templates/secret.yaml
@@ -85,6 +85,228 @@ stringData:
       "nbformat_minor": 5
     }

+  tpcds: |
+    {
+      "cells": [
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "df378969-32dd-4d27-bbeb-89d532f5fb8b",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "%%configure\n",
+            "{\n",
+            "  \"conf\":{\n",
+            "    \"spark.kubernetes.executor.podNamePrefix\": \"tpcds-worker\",\n",
+            "    \"spark.kubernetes.container.image.pullPolicy\": \"IfNotPresent\",\n",
+            "    \"spark.driver.memory\" : \"2G\",\n",
+            "    \"spark.executor.memory\": \"2G\"\n",
+            "  }\n",
+            "}"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "dbce441f-1825-4b07-9b1a-0a2c74bac6b8",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "import subprocess, io\n",
+            "command=\"pip install tpcds_pyspark sparkmeasure plotly kaleido pandas --user\"\n",
+            "proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE)\n",
+            "for line in io.TextIOWrapper(proc.stdout, encoding=\"utf-8\"):\n",
+            "    print(line.rstrip())"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "a483c3fc-7839-4e05-aa6e-a493265b2276",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "# Download TPC-DS data\n",
+            "import subprocess, io\n",
+            "command=\"[ ! -d '/opt/spark/work-dir/tpcds_10' ] && wget -O /opt/spark/work-dir/tpcds_10.zip https://sparkdltrigger.web.cern.ch/sparkdltrigger/TPCDS/tpcds_10.zip && unzip /opt/spark/work-dir/tpcds_10.zip\"\n",
+            "proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)\n",
+            "for line in io.TextIOWrapper(proc.stdout, encoding=\"utf-8\"):\n",
+            "    print(line.rstrip())"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "923963fd-5166-4232-bc43-b9ca35df5080",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "from tpcds_pyspark import TPCDS\n",
+            "\n",
+            "# Basic configuration, use tpcds 10G scale and just run 2 queries to get started\n",
+            "tpcds = TPCDS(data_path=\"/opt/spark/work-dir/tpcds_10\", queries=['q1', 'q2'])\n"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "25272237-dd08-4321-8ef8-f6d3a6f4d3d3",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "# Map TPC-DS tables into temporary views\n",
+            "tpcds.map_tables() "
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "a0bbfd33-fbef-4495-9f8d-1a9d44a33039",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "# Run the TPC-DS workload\n",
+            "results = tpcds.run_TPCDS()"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "81f636cb-bad0-4007-98ad-576213f2f5cb",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "# Print the test output and reports\n",
+            "tpcds.print_test_results()"
+          ]
+        },
+        {
+          "cell_type": "markdown",
+          "execution_count": null,
+          "id": "b5ce4efc-a081-4e69-b989-e6bcb10e80b1",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "## Test data analysis\n"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "05f28ad5-c41d-4420-a78d-b76032efc0e2",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "tpcds.aggregated_results_pdf"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "6cadc03c-00a2-4e28-8605-425b6dbc027f",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "tpcds.grouped_results_pdf"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "11c42044-3479-49f3-89d1-581ce27a6e5d",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "tpcds.grouped_results_pdf.columns"
+          ]
+        },
+        {
+          "cell_type": "markdown",
+          "execution_count": null,
+          "id": "cb27bd04-1e7b-4d2e-8926-3e06e10a21c4",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "## Example of data plotting"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "e088c410-2903-4aab-808c-90506cf45059",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "import plotly.express as px"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "1fff0ded-fb22-401d-a445-5b0248001bbc",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "%%spark -o image_out\n",
+            "import io, base64, sys\n",
+            "from pyspark.sql.types import StructType,StructField, StringType\n",
+            "\n",
+            "iobytes2 = io.BytesIO()\n",
+            "fig = px.bar(tpcds.grouped_results_pdf.reset_index(), x='query', y=['executorRunTime', 'executorCpuTime'], \n",
+            "             title='Executor Run Time and CPU Time per Query', barmode='group',\n",
+            "             labels={'value': 'Time (ms)', 'variable': 'Metric'}) # Customizing y-axis label and legend title)\n",
+            "\n",
+            "fig_image_bytes = fig.to_image(format=\"jpg\")\n",
+            "\n",
+            "# Save the image to an io.BytesIO object\n",
+            "iobytes2 = io.BytesIO(fig_image_bytes)\n",
+            "\n",
+            "\n",
+            "iobytes2.seek(0)\n",
+            "my_base64_jpgData2 = base64.b64encode(iobytes2.read()).decode()\n",
+            "schema = StructType([\n",
+            "    StructField(\"content\", StringType(), nullable=False)\n",
+            "])\n",
+            "image_out = spark.createDataFrame([(my_base64_jpgData2,)], schema)\n",
+            "\n"
+          ]
+        },
+        {
+          "cell_type": "code",
+          "execution_count": null,
+          "id": "799afa24-1ed5-4aa4-86bb-f92e340c0c4b",
+          "metadata": {},
+          "outputs": [],
+          "source": [
+            "%%local\n",
+            "import base64\n",
+            "from IPython import display\n",
+            "display.Image(base64.b64decode(image_out['content'][0]))\n"
+          ]
+        }
+      ],
+      "metadata": {
+        "kernelspec": {
+          "display_name": "PySpark",
+          "language": "python",
+          "name": "pysparkkernel"
+        },
+        "language_info": {
+          "codemirror_mode": {
+            "name": "python",
+            "version": 3
+          },
+          "file_extension": ".py",
+          "mimetype": "text/x-python",
+          "name": "pyspark",
+          "pygments_lexer": "python3"
+        }
+      },
+      "nbformat": 4,
+      "nbformat_minor": 5
+    }
+
   jupyter_notebook_config: |
     c = get_config()
     c.NotebookApp.password = '{{ .Values.jupyterPassword}}'
\ No newline at end of file
diff --git a/installer/charts/lighter/templates/configmap.yaml b/installer/charts/lighter/templates/configmap.yaml
new file mode 100644
index 0000000..e330b8c
--- /dev/null
+++ b/installer/charts/lighter/templates/configmap.yaml
@@ -0,0 +1,44 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "lighter.fullname" . }}-pod-template
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "lighter.labels" $ | trim | nindent 4 }}
+data:
+  driver_pod_template.yaml: |
+    apiVersion: v1
+    kind: Pod
+    spec:
+      tolerations:
+        - key: dedicated
+          operator: Equal
+          value: spark-driver
+          effect: NoSchedule
+      containers:
+        - securityContext:
+            allowPrivilegeEscalation: false
+            fsGroup: 185
+            runAsGroup: 185
+            runAsNonRoot: true
+            runAsUser: 185
+            fsGroupChangePolicy: OnRootMismatch
+
+
+  executor_pod_template.yaml: |
+    apiVersion: v1
+    kind: Pod
+    spec:
+      tolerations:
+        - key: dedicated
+          operator: Equal
+          value: spark
+          effect: NoSchedule
+      containers:
+        - securityContext:
+            allowPrivilegeEscalation: false
+            fsGroup: 185
+            runAsGroup: 185
+            runAsNonRoot: true
+            runAsUser: 185
+            fsGroupChangePolicy: OnRootMismatch
diff --git a/installer/charts/lighter/templates/deployment.yaml b/installer/charts/lighter/templates/deployment.yaml
index fd59818..829b285 100644
--- a/installer/charts/lighter/templates/deployment.yaml
+++ b/installer/charts/lighter/templates/deployment.yaml
@@ -121,10 +121,6 @@ spec:
         - name: LIGHTER_BATCH_DEFAULT_CONF
           value: '{
             "spark.log.level": "error",
-            "spark.kubernetes.driver.runAsUser": "185",
-            "spark.kubernetes.executor.runAsUser": "185",
-            "spark.kubernetes.driver.fsGroup": "185",
-            "spark.kubernetes.executor.fsGroup": "185",
             "spark.hadoop.hive.metastore.uris": "thrift://{{ if and .Values.global ( .Values.global.isTopLevel) }}{{ .Release.Name }}-hive-metastore{{ else }}{{ required "hive metastore installed full service name required" $hiveMetastoreServiceName }}{{ end }}.{{ if and .Values.global ( .Values.global.isTopLevel) }}{{ .Release.Namespace }}{{ else }}{{ required "hive metastore installed namespace required" .Values.hiveMetastore.namespace }}{{ end }}.svc.cluster.local:9083",
             "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
             "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
@@ -161,6 +157,11 @@
             "spark.kubernetes.driverEnv.PYTHONPATH": "{{ .Values.python.user.base }}",
             "spark.executorEnv.PYTHONPATH": "{{ .Values.python.user.base }}",

+            "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.options.claimName": "{{ include "lighter.fullname" . }}-storage",
+            "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.mount.path": "/opt/spark/work-dir/",
+            "spark.kubernetes.executor.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.options.claimName": "{{ include "lighter.fullname" . }}-storage",
+            "spark.kubernetes.executor.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.mount.path": "/opt/spark/work-dir/",
+
             {{ if .Values.spark.history.eventLog.usePVC }}
             "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ $sparkHistoryServerPVCName }}.options.claimName": "{{ $sparkHistoryServerPVCName }}",
             "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ $sparkHistoryServerPVCName }}.mount.path": "{{ .Values.spark.history.eventLog.dir }}",
@@ -196,10 +197,6 @@
         - name: LIGHTER_SESSION_DEFAULT_CONF
           value: '{
            "spark.log.level": "error",
-            "spark.kubernetes.driver.runAsUser": "185",
-            "spark.kubernetes.executor.runAsUser": "185",
-            "spark.kubernetes.driver.fsGroup": "185",
-            "spark.kubernetes.executor.fsGroup": "185",
             "spark.hadoop.hive.metastore.uris": "thrift://{{ if and .Values.global ( .Values.global.isTopLevel) }}{{ .Release.Name }}-hive-metastore{{ else }}{{ required "hive metastore installed full service name required" $hiveMetastoreServiceName }}{{ end }}.{{ if and .Values.global ( .Values.global.isTopLevel) }}{{ .Release.Namespace }}{{ else }}{{ required "hive metastore installed namespace required" .Values.hiveMetastore.namespace }}{{ end }}.svc.cluster.local:9083",
             "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
             "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
@@ -236,6 +233,12 @@
             "spark.kubernetes.driverEnv.PYTHONPATH": "{{ .Values.python.user.base }}",
             "spark.executorEnv.PYTHONPATH": "{{ .Values.python.user.base }}",

+            "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.options.claimName": "{{ include "lighter.fullname" . }}-storage",
+            "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.mount.path": "/opt/spark/work-dir/",
+            "spark.kubernetes.executor.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.options.claimName": "{{ include "lighter.fullname" . }}-storage",
+            "spark.kubernetes.executor.volumes.persistentVolumeClaim.{{ include "lighter.fullname" . }}-storage.mount.path": "/opt/spark/work-dir/",
+
+
             {{ if .Values.spark.history.eventLog.usePVC }}
             "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ $sparkHistoryServerPVCName }}.options.claimName": "{{ $sparkHistoryServerPVCName }}",
             "spark.kubernetes.driver.volumes.persistentVolumeClaim.{{ $sparkHistoryServerPVCName }}.mount.path": "{{ .Values.spark.history.eventLog.dir }}",
@@ -272,5 +275,12 @@
           value: "{{ .Values.spark.history.url }}"
         - name: LIGHTER_KUBERNETES_SERVICE_ACCOUNT
           value: "{{ include "lighter.fullname" . }}-sa"
+        volumeMounts:
+        - name: pod-template
+          mountPath: /home/app/k8s/

-      serviceAccountName: "{{ include "lighter.fullname" . }}-sa"
\ No newline at end of file
+      serviceAccountName: "{{ include "lighter.fullname" . }}-sa"
+      volumes:
+      - name: pod-template
+        configMap:
+          name: {{ include "lighter.fullname" . }}-pod-template
\ No newline at end of file
diff --git a/installer/charts/lighter/templates/pvc.yaml b/installer/charts/lighter/templates/pvc.yaml
new file mode 100644
index 0000000..9d3b8ca
--- /dev/null
+++ b/installer/charts/lighter/templates/pvc.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "lighter.fullname" . }}-storage
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "lighter.labels" $ | trim | nindent 4 }}
+  annotations:
+    helm.sh/resource-policy: "keep"
+
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: "50Gi"
\ No newline at end of file
diff --git a/submit_job_example/submit_pyspark_job.py b/submit_job_example/submit_pyspark_job.py
index f674804..5fcc522 100644
--- a/submit_job_example/submit_pyspark_job.py
+++ b/submit_job_example/submit_pyspark_job.py
@@ -15,7 +15,7 @@
     "driverCores": 1,
     "driverMemory": "1G",
     "conf": {
-        "spark.kubernetes.container.image": 'ghcr.io/aisingapore/kapitan-spark/spark:0.0.4-spark3.5.1',
+        "spark.kubernetes.container.image": 'ghcr.io/aisingapore/kapitan-spark/spark:0.0.5-spark3.5.1',
         "spark.kubernetes.container.image.pullPolicy": 'Always'
     }
 }