From c0fc6cfc581b4582d3c29942b744f052991c316d Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 20 Dec 2022 07:59:59 +0800 Subject: [PATCH] Databricks plugin (#3142) Signed-off-by: Kevin Su --- charts/flyte-core/README.md | 1 + .../templates/propeller/configmap.yaml | 5 + charts/flyte-core/values.yaml | 15 +++ .../plugin_setup/webapi/databricks.rst | 103 ++++++++++++++++++ rsts/deployment/plugin_setup/webapi/index.rst | 10 ++ 5 files changed, 134 insertions(+) create mode 100644 rsts/deployment/plugin_setup/webapi/databricks.rst diff --git a/charts/flyte-core/README.md b/charts/flyte-core/README.md index 3306f966d1..796f0d0867 100644 --- a/charts/flyte-core/README.md +++ b/charts/flyte-core/README.md @@ -107,6 +107,7 @@ helm install gateway bitnami/contour -n flyte | configmap.task_logs.plugins.logs.cloudwatch-enabled | bool | `false` | One option is to enable cloudwatch logging for EKS, update the region and log group accordingly | | configmap.task_resource_defaults | object | `{"task_resources":{"defaults":{"cpu":"100m","memory":"500Mi","storage":"500Mi"},"limits":{"cpu":2,"gpu":1,"memory":"1Gi","storage":"20Mi"}}}` | Task default resources configuration Refer to the full [structure](https://pkg.go.dev/github.com/lyft/flyteadmin@v0.3.37/pkg/runtime/interfaces#TaskResourceConfiguration). | | configmap.task_resource_defaults.task_resources | object | `{"defaults":{"cpu":"100m","memory":"500Mi","storage":"500Mi"},"limits":{"cpu":2,"gpu":1,"memory":"1Gi","storage":"20Mi"}}` | Task default resources parameters | +| databricks | object | `{"enabled":false,"plugin_config":{"plugins":{"databricks":{"databricksInstance":"dbc-a53b7a3c-614c","entrypointFile":"dbfs:///FileStore/tables/entrypoint.py"}}}}` | Optional: Databricks Plugin allows us to run the spark job on the Databricks platform. 
| | datacatalog.affinity | object | `{}` | affinity for Datacatalog deployment | | datacatalog.configPath | string | `"/etc/datacatalog/config/*.yaml"` | Default regex string for searching configuration files | | datacatalog.enabled | bool | `true` | | diff --git a/charts/flyte-core/templates/propeller/configmap.yaml b/charts/flyte-core/templates/propeller/configmap.yaml index 3ab46bcc28..21f45bff79 100644 --- a/charts/flyte-core/templates/propeller/configmap.yaml +++ b/charts/flyte-core/templates/propeller/configmap.yaml @@ -40,6 +40,11 @@ data: {{- with .Values.sparkoperator.plugin_config }} spark.yaml: | {{ tpl (toYaml .) $ | nindent 4 }} {{- end }} +{{- end }} +{{- if .Values.databricks.enabled }} +{{- with .Values.databricks.plugin_config }} + databricks.yaml: | {{ tpl (toYaml .) $ | nindent 4 }} +{{- end }} {{- end }} storage.yaml: | {{ tpl (include "storage" .) $ | nindent 4 }} cache.yaml: | diff --git a/charts/flyte-core/values.yaml b/charts/flyte-core/values.yaml index b2b6b083db..3da8dc55bc 100755 --- a/charts/flyte-core/values.yaml +++ b/charts/flyte-core/values.yaml @@ -821,3 +821,18 @@ sparkoperator: - spark.blacklist.enabled: "true" - spark.blacklist.timeout: "5m" - spark.task.maxfailures: "8" + + +# -------------------------------------------------------- +# Optional Plugins +# -------------------------------------------------------- + +# -- Optional: Databricks Plugin allows us to run the spark job on the Databricks platform. +databricks: + enabled: false + plugin_config: + plugins: + databricks: + entrypointFile: dbfs:///FileStore/tables/entrypoint.py + # Databricks account + databricksInstance: dbc-a53b7a3c-614c diff --git a/rsts/deployment/plugin_setup/webapi/databricks.rst b/rsts/deployment/plugin_setup/webapi/databricks.rst new file mode 100644 index 0000000000..41b96a837e --- /dev/null +++ b/rsts/deployment/plugin_setup/webapi/databricks.rst @@ -0,0 +1,103 @@ +.. 
_deployment-plugin-setup-webapi-databricks: + +Databricks Plugin Setup +----------------------- + +This guide gives an overview of how to set up Databricks in your Flyte deployment. + +1. Add the Flyte chart repo to Helm + +.. code-block:: + + helm repo add flyteorg https://flyteorg.github.io/flyte + + +2. Set up the cluster + +.. tabbed:: Sandbox + + * Start the sandbox cluster + + .. code-block:: bash + + flytectl sandbox start + + * Generate Flytectl sandbox config + + .. code-block:: bash + + flytectl config init + +.. tabbed:: AWS/GCP + + * Make sure you have a Flyte cluster up and running in `AWS `__ / `GCP `__ + * Make sure you have the correct kubeconfig and have selected the correct Kubernetes context + * Make sure you have the correct flytectl config at ~/.flyte/config.yaml + +3. Upload an `entrypoint.py `__ to DBFS or S3. The Spark driver node runs this file to override the default command in the Databricks job. + +4. Create a file named ``values-override.yaml`` and add the following config to it: + +.. code-block:: yaml + + configmap: + enabled_plugins: + # -- Tasks specific configuration [structure](https://pkg.go.dev/github.com/flyteorg/flytepropeller/pkg/controller/nodes/task/config#GetConfig) + tasks: + # -- Plugins configuration, [structure](https://pkg.go.dev/github.com/flyteorg/flytepropeller/pkg/controller/nodes/task/config#TaskPluginConfig) + task-plugins: + # -- [Enabled Plugins](https://pkg.go.dev/github.com/flyteorg/flyteplugins/go/tasks/config#Config). Enable sagemaker*, athena if you install the backend + # plugins + enabled-plugins: + - container + - sidecar + - k8s-array + - databricks + default-for-task-types: + container: container + sidecar: sidecar + container_array: k8s-array + spark: databricks + databricks: + enabled: True + plugin_config: + plugins: + databricks: + entrypointFile: dbfs:///FileStore/tables/entrypoint.py + databricksInstance: dbc-a53b7a3c-614c + +5. Create a Databricks account and follow the docs for creating an access token. + +6. 
Create an `Instance Profile `_ for the Spark cluster; it allows the Spark job to access your data in the S3 bucket. + +7. Add the Databricks access token to FlytePropeller. + +.. note:: + Refer to the `access token `__ docs to understand setting up the Databricks access token. + +.. code-block:: bash + + kubectl edit secret -n flyte flyte-secret-auth + +The configuration will look as follows: + +.. code-block:: yaml + + apiVersion: v1 + data: + FLYTE_DATABRICKS_API_TOKEN: <ACCESS_TOKEN> + client_secret: Zm9vYmFy + kind: Secret + metadata: + annotations: + meta.helm.sh/release-name: flyte + meta.helm.sh/release-namespace: flyte + ... + +Replace ``<ACCESS_TOKEN>`` with your access token. + +8. Upgrade the Flyte Helm release. + +.. code-block:: bash + + helm upgrade -n flyte -f https://raw.githubusercontent.com/flyteorg/flyte/master/charts/flyte-core/values-sandbox.yaml -f values-override.yaml flyteorg/flyte-core diff --git a/rsts/deployment/plugin_setup/webapi/index.rst b/rsts/deployment/plugin_setup/webapi/index.rst index 7ade2be7b8..be7ef945c3 100644 --- a/rsts/deployment/plugin_setup/webapi/index.rst +++ b/rsts/deployment/plugin_setup/webapi/index.rst @@ -18,6 +18,15 @@ Web API Plugin Setup ^^^^^^^^^^^^ Guide to setting up the Snowflake Plugin. + --- + + .. link-button:: deployment-plugin-setup-webapi-databricks + :type: ref + :text: Databricks Plugin + :classes: btn-block stretched-link + ^^^^^^^^^^^^ + Guide to setting up the Databricks Plugin. + .. toctree:: :maxdepth: 1 @@ -25,5 +34,6 @@ Web API Plugin Setup :hidden: snowflake + databricks