From f62407ba9d2233cceaac37c77a53b5fed3cb4726 Mon Sep 17 00:00:00 2001
From: Stuart Lynn <50170698+sllynn@users.noreply.github.com>
Date: Tue, 24 Sep 2024 10:23:01 +0100
Subject: [PATCH 01/21] Fix/quickstart-MosaicAnalyzer (#565)

* updated MosaicAnalyzer section of quickstart notebook

* A few other tweaks

* Typo
---
 .../Quickstart/QuickstartNotebook.ipynb       | 458 +++++++++---------
 1 file changed, 233 insertions(+), 225 deletions(-)

diff --git a/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
index d45e3b986..f2614f071 100644
--- a/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
+++ b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
@@ -12,6 +12,7 @@
     }
    },
    "source": [
+    "%md\n",
     "# Mosaic Quickstart\n",
     "\n",
     "> Perform a point-in-polygon spatial join between NYC Taxi trips and zones. __Note: this does not get into performance tweaks that are available for scaled joins.__\n",
@@ -29,13 +30,10 @@
     " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n",
     " * Import with `from keplergl import KeplerGl` to use directly\n",
     "\n",
-    "If you have trouble with Volume access:\n",
-    "\n",
-    "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n",
-    "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n",
+    "If you have trouble reading source datasets from a Unity Catalog Volume, the easiest workaround is to copy resources to an accessible location in DBFS. Note: 'Shared' access mode clusters are not supported at all.\n",
     "\n",
     "--- \n",
-    " __Last Update__ 28 NOV 2023 [Mosaic 0.3.12]"
+    " __Last Update__ 10 JUN 2024 [Mosaic 0.4.2]"
    ]
   },
   {
@@ -57,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -72,22 +70,21 @@
    },
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Python interpreter will be restarted.\nPython interpreter will be restarted.\n"
+      "Python interpreter will be restarted.\n",
+      "Python interpreter will be restarted.\n"
      ]
     }
    ],
    "source": [
-    "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n",
-    "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)"
+    "%pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series"
    ]
   },
   {
@@ -109,8 +106,7 @@
     "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n",
     "\n",
     "# -- import databricks + spark functions\n",
-    "from pyspark.sql import functions as F\n",
-    "from pyspark.sql.functions import col, udf\n",
+    "from pyspark.sql.functions import col, udf, lit, to_json, explode, array\n",
     "from pyspark.sql.types import *\n",
     "\n",
     "# -- setup mosaic\n",
@@ -145,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -160,7 +156,6 @@
    },
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
      "output_type": "stream",
      "text": [
@@ -194,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -209,7 +204,6 @@
    },
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
      "output_type": "stream",
      "text": [
@@ -228,7 +222,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -243,7 +237,6 @@
    },
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
      "output_type": "stream",
      "text": [
@@ -251,7 +244,6 @@
      ]
     },
     {
-     "output_type": "display_data",
      "data": {
       "text/html": [
       "
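
Taken together, the hunks above move the quickstart onto the Mosaic 0.4 track. For orientation, a minimal sketch of the setup sequence the patched notebook converges on is below. The `%pip` line, shuffle-partition setting, and import list come straight from the hunks; `import mosaic as mos` and `mos.enable_mosaic(spark, dbutils)` are the documented Mosaic enablement steps assumed to sit in the truncated `# -- setup mosaic` cell.

```python
# -- cell 1: install Mosaic 0.4 (runs in its own cell; restarts the Python interpreter)
# %pip install "databricks-mosaic<0.5,>=0.4" --quiet  # <- Mosaic 0.4 series

# -- cell 2: configure spark and enable mosaic
spark.conf.set("spark.sql.shuffle.partitions", 1_024)  # <-- default is 200

# -- import databricks + spark functions (import list as revised by this patch)
from pyspark.sql.functions import col, udf, lit, to_json, explode, array
from pyspark.sql.types import *

# -- setup mosaic: `spark` and `dbutils` are Databricks notebook globals
import mosaic as mos

mos.enable_mosaic(spark, dbutils)
```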
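The join cells themselves fall outside this excerpt, but the point-in-polygon join the notebook header promises typically follows Mosaic's grid tessellation pattern, sketched below. The `trips` and `zones` DataFrames, their column names, and the hard-coded resolution are placeholder assumptions; in the notebook the resolution is derived via `MosaicAnalyzer` (the subject of this PR), whose exact call is not visible here. The grid and ST functions are documented Mosaic 0.4 API.

```python
from pyspark.sql.functions import col, lit
import mosaic as mos

resolution = 9  # placeholder; the notebook derives this with MosaicAnalyzer

# explode each zone polygon into grid "chips": a struct of (index_id, wkb, is_core)
indexed_zones = zones.withColumn(
    "chip", mos.grid_tessellateexplode(col("geom"), lit(resolution))
)

# index each pickup point to the grid cell that contains it
indexed_trips = (
    trips
    .withColumn("pickup_point", mos.st_point(col("pickup_lon"), col("pickup_lat")))
    .withColumn("cell_id", mos.grid_pointascellid(col("pickup_point"), lit(resolution)))
)

# equi-join on cell id, then verify boundary chips with an exact contains test
trips_with_zone = (
    indexed_trips
    .join(indexed_zones, col("cell_id") == col("chip.index_id"))
    .where(col("chip.is_core") | mos.st_contains(col("chip.wkb"), col("pickup_point")))
)
```

Core chips (`is_core`) lie entirely inside a polygon, so only boundary chips pay for the exact `st_contains` test; that asymmetry is what makes the grid join cheaper than a naive cross join on `st_contains`.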