diff --git a/deep-learning/src/main/python/synapse/ml/__init__.py b/deep-learning/src/main/python/synapse/ml/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py
new file mode 100644
index 0000000000..69eca7add1
--- /dev/null
+++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from sentence_transformers import SentenceTransformer
+from pyspark.ml.functions import predict_batch_udf
+from pyspark.ml import Transformer
+from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params
+from pyspark.sql.types import (
+    ArrayType,
+    FloatType,
+)
+
+
+class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol):
+    """
+    Custom transformer that extends PySpark's Transformer class to
+    perform sentence embedding using a model with optional TensorRT acceleration.
+    """
+
+    NUM_OPT_ROWS = 100  # Constant for number of rows taken for model optimization
+
+    BATCH_SIZE_DEFAULT = 64
+
+    # Define additional parameters
+    runtime = Param(
+        Params._dummy(),
+        "runtime",
+        "Specifies the runtime environment: cpu, cuda, or tensorrt",
+    )
+    batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int)
+    modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter")
+
+    def __init__(
+        self,
+        inputCol=None,
+        outputCol=None,
+        runtime=None,
+        batchSize=None,
+        modelName=None,
+    ):
+        """
+        Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag.
+        """
+        super(HuggingFaceSentenceEmbedder, self).__init__()
+
+        # Determine the default runtime based on CUDA availability
+        default_runtime = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Override the provided runtime if CUDA is not available
+        effective_runtime = runtime if torch.cuda.is_available() else "cpu"
+
+        self._setDefault(
+            runtime=default_runtime,
+            batchSize=self.BATCH_SIZE_DEFAULT,
+        )
+        self._set(
+            inputCol=inputCol,
+            outputCol=outputCol,
+            runtime=effective_runtime,
+            batchSize=batchSize if batchSize is not None else self.BATCH_SIZE_DEFAULT,
+            modelName=modelName,
+        )
+        self.optData = None
+        self.model = None
+        # Placeholder for the DataFrame row count check
+        self.row_count = 0  # This should be set when the DataFrame is available
+
+    # Setter method for batchSize
+    def setBatchSize(self, value):
+        self._set(batchSize=value)
+        return self
+
+    # Getter method for batchSize
+    def getBatchSize(self):
+        return self.getOrDefault(self.batchSize)
+
+    # Sets the runtime environment for the model.
+    # Supported values: 'cpu', 'cuda', 'tensorrt'
+    def setRuntime(self, value):
+        """
+        Sets the runtime environment for the model.
+        Supported values: 'cpu', 'cuda', 'tensorrt'
+        """
+        # if value not in ["cpu", "cuda", "onnxrt", "tensorrt"]:
+        if value not in ["cpu", "cuda", "tensorrt"]:
+            raise ValueError(
+                "Invalid runtime specified. Choose from 'cpu', 'cuda', 'tensorrt'"
+            )
+        self.setOrDefault(self.runtime, value)
+
+    def getRuntime(self):
+        return self.getOrDefault(self.runtime)
+
+    # Setter method for modelName
+    def setModelName(self, value):
+        self._set(modelName=value)
+        return self
+
+    # Getter method for modelName
+    def getModelName(self):
+        return self.getOrDefault(self.modelName)
+
+    def setRowCount(self, row_count):
+        self.row_count = row_count
+        # Override the runtime if row count is less than 100 or CUDA is not available
+        if self.row_count < 100 or not torch.cuda.is_available():
+            self.set(self.runtime, "cpu")
+        return self
+
+    # Optimize the model using Model Navigator with TensorRT configuration.
+    def _optimize(self, model):
+        import tensorrt as trt
+        import model_navigator as nav
+
+        conf = nav.OptimizeConfig(
+            target_formats=(nav.Format.TENSORRT,),
+            runners=("TensorRT",),
+            optimization_profile=nav.OptimizationProfile(
+                max_batch_size=self.BATCH_SIZE_DEFAULT
+            ),
+            custom_configs=[
+                nav.TorchConfig(autocast=True),
+                nav.TorchScriptConfig(autocast=True),
+                nav.TensorRTConfig(
+                    precision=(nav.TensorRTPrecision.FP16,),
+                    onnx_parser_flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM.value],
+                ),
+            ],
+        )
+
+        def _get_dataloader():
+            input_data = self.optData
+            return [
+                (
+                    0,
+                    (
+                        input_data,
+                        {"show_progress_bar": False, "batch_size": self.getBatchSize()},
+                    ),
+                )
+            ]
+
+        nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf)
+
+    def _predict_batch_fn(self):
+        """
+        Create and return a function for batch prediction.
+        """
+        runtime = self.getRuntime()
+        if self.model == None:
+            global model
+            modelName = self.getModelName()
+
+            model = SentenceTransformer(
+                modelName, device="cpu" if runtime == "cpu" else "cuda"
+            ).eval()
+
+            if runtime in ("tensorrt"):
+                import tensorrt as trt
+                import model_navigator as nav
+
+                # this forces navigator to use specific runtime
+                nav.inplace_config.strategy = nav.SelectedRuntimeStrategy(
+                    "trt-fp16", "TensorRT"
+                )
+
+                moduleName = modelName.split("/")[1]
+                model = nav.Module(model, name=moduleName, forward_func="forward")
+                try:
+                    nav.load_optimized()
+                except Exception:
+                    self._optimize(model)
+                    nav.load_optimized()
+
+            self.model = model
+
+        def predict(inputs):
+            """
+            Predict method to encode inputs using the model.
+            """
+            with torch.no_grad():
+                output = model.encode(
+                    inputs.tolist(), convert_to_tensor=False, show_progress_bar=False
+                )
+
+            return output
+
+        return predict
+
+    # Method to apply the transformation to the dataset
+    def _transform(self, dataset, spark):
+        """
+        Apply the transformation to the input dataset.
+        """
+        input_col = self.getInputCol()
+        output_col = self.getOutputCol()
+
+        size = dataset.count()
+        self.setRowCount(size)
+        if size >= self.NUM_OPT_ROWS:
+            df = dataset.take(self.NUM_OPT_ROWS)
+            self.optData = [row[input_col] for row in df]
+
+        encode = predict_batch_udf(
+            self._predict_batch_fn,
+            return_type=ArrayType(FloatType()),
+            batch_size=self.getBatchSize(),
+        )
+        return dataset.withColumn(output_col, encode(input_col))
+
+    def transform(self, dataset, spark=None):
+        """
+        Public method to transform the dataset.
+        """
+        return self._transform(dataset, spark)
diff --git a/deep-learning/src/main/python/synapse/ml/hf/__init__.py b/deep-learning/src/main/python/synapse/ml/hf/__init__.py
new file mode 100644
index 0000000000..e93fd6a140
--- /dev/null
+++ b/deep-learning/src/main/python/synapse/ml/hf/__init__.py
@@ -0,0 +1 @@
+from synapse.ml.hf.HuggingFaceSentenceEmbedder import *
diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py
new file mode 100644
index 0000000000..00208dae03
--- /dev/null
+++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py
@@ -0,0 +1,95 @@
+# Copyright (C) NVIDIA Corporation. All rights reserved.
+# Licensed under the Apache License, See LICENSE in project root for information.
+
+import os, json, subprocess, unittest
+from synapse.ml.hf import HuggingFaceSentenceEmbedder
+from synapse.ml.nn import KNN
+from synapse.ml.core.init_spark import *
+from pyspark.sql import DataFrame, SQLContext
+
+spark = init_spark()
+sc = SQLContext(spark.sparkContext)
+
+
+class HuggingFaceSentenceTransformerTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super(HuggingFaceSentenceTransformerTest, self).__init__(*args, **kwargs)
+
+        self.miniLMSize = 384
+        self.e5Size = 1024
+
+        self.e5Transformer = HuggingFaceSentenceEmbedder(
+            modelName="intfloat/e5-large-v2",
+            inputCol="data",
+            outputCol="embeddings",
+            runtime="cpu",
+        )
+
+        self.miniLMTransformer = HuggingFaceSentenceEmbedder(
+            modelName="sentence-transformers/all-MiniLM-L6-v2",
+            inputCol="data",
+            outputCol="embeddings",
+            runtime="cpu",
+        )
+
+        # construction of test dataframe
+        self.sentenceDataFrame = sc.createDataFrame(
+            [
+                (1, "Happy"),
+                (2, "Good"),
+                (3, "Delicious"),
+                (4, "Like it"),
+                (5, "OK"),
+                (6, "Disgusting"),
+                (7, "Bad"),
+                (8, "Don't like it"),
+                (9, "Tastless"),
+                (10, "Poor quality"),
+            ],
+            ["id", "data"],
+        )
+        # self.sentenceDataFrame = (
+        #     init_spark()
+        #     .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"])
+        #     .cache()
+        # )
+
+    def test_e5_Embedding(self):
+        self._assert_input(self.sentenceDataFrame)
+        transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache()
+        self._assert_input(transformed)
+        self._assert_embedding_df_size(self.sentenceDataFrame, transformed)
+        self._assert_embedding_embedding_size(transformed, self.e5Size)
+
+    def test_miniLM_Embedding(self):
+        self._assert_input(self.sentenceDataFrame)
+        transformed = self.miniLMTransformer.transform(self.sentenceDataFrame).cache()
+        self._assert_input(transformed)
+        self._assert_embedding_df_size(self.sentenceDataFrame, transformed)
+        self._assert_embedding_embedding_size(transformed, self.miniLMSize)
+
+    def _assert_input(self, input):
+        # Use assert to check if the result is a DataFrame
+        testDf = self.sentenceDataFrame
+        assert isinstance(testDf, DataFrame), "The input is not a DataFrame."
+
+    def _assert_embedding_embedding_size(self, transformed, expected_size):
+        # Debugging to check the type
+        collected_data = transformed.collect()
+        for row in collected_data:
+            embeddings_array = row["embeddings"]
+            size = len(embeddings_array)
+            assert (
+                size == expected_size
+            ), f"Embedding size mismatch: expected {expected_size}, got {size}"
+
+    def _assert_embedding_df_size(self, dataframe, transformed):
+        num_rows = transformed.count()
+        expected_num_rows = dataframe.count()
+        assert (
+            num_rows == expected_num_rows
+        ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}"
+
+
+if __name__ == "__main__":
+    result = unittest.main()
diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb
new file mode 100644
index 0000000000..e979110a30
--- /dev/null
+++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb	
@@ -0,0 +1,583 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6166efcb-b7f8-424b-8015-cb646a764271",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n",
+    "\n",
+    "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n",
+    "\n",
+    "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 1: Prepare Environment\n",
+    "\n",
+    "It will imports required libraries and get initial settings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d188d8ee-8913-4170-8d35-8490f833ae95",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import sys\n",
+    "import pyspark.sql.functions as F\n",
+    "from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n",
+    "from pyspark.ml.linalg import Vectors\n",
+    "from pyspark.ml.linalg import VectorUDT\n",
+    "from spark_rapids_ml.knn import (\n",
+    "    ApproximateNearestNeighbors,\n",
+    "    ApproximateNearestNeighborsModel,\n",
+    ")\n",
+    "from synapse.ml.hf import HuggingFaceSentenceEmbedder\n",
+    "from synapse.ml.nn import KNN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "42117315-a245-491a-b330-f8257d6fb35c",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 2: Load Input Data\n",
+    "\n",
+    "It will load public dataset and generate extra syntetic rows if set by size parameter\n",
+    "\n",
+    "The loaded dataset has 1000 rows. If you specify <i>number_of_input_rows</i> in [1..1000] it will cut extra rows if needed\n",
+    "\n",
+    "If <i>number_of_input_rows</i> in [1000..1000000] it will generate extra rows using cross join of original data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "file_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv\"\n",
+    "\n",
+    "df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(file_path)\n",
+    "df = df.withColumn(\n",
+    "    \"data\",\n",
+    "    F.format_string(\"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)),\n",
+    ")\n",
+    "\n",
+    "# Size of DF\n",
+    "number_of_input_rows = 100\n",
+    "\n",
+    "\n",
+    "# Check if the row count is less than 10\n",
+    "if number_of_input_rows <= 0 or number_of_input_rows >= 1000000:\n",
+    "    raise ValueError(f\"Limit is {number_of_input_rows}, which should be less than 1M.\")\n",
+    "\n",
+    "if number_of_input_rows > 1000:\n",
+    "\n",
+    "    # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n",
+    "    cross_joined_df = df.crossJoin(df.withColumnRenamed(\"data\", \"data_\"))\n",
+    "\n",
+    "    # Create a new column 'result_vector' by concatenating the two source vectors\n",
+    "    tmp_df = cross_joined_df.withColumn(\n",
+    "        \"result_vector\",\n",
+    "        F.concat(F.col(\"data\"), F.lit(\". \\n\"), F.col(\"data_\")),\n",
+    "    )\n",
+    "\n",
+    "    # Select only the necessary columns and show the result\n",
+    "    tmp_df = tmp_df.select(\"result_vector\")\n",
+    "\n",
+    "    # Shuffle the DataFrame with a fixed seed to have close strings spreaded\n",
+    "    seed = 42\n",
+    "\n",
+    "    df = (\n",
+    "        tmp_df.withColumnRenamed(\"result_vector\", \"data\")\n",
+    "        .withColumn(\"id\", F.monotonically_increasing_id())\n",
+    "        .orderBy(F.rand(seed))\n",
+    "    )\n",
+    "\n",
+    "df = df.limit(number_of_input_rows).repartition(10).cache()\n",
+    "\n",
+    "print(f\"Loaded: {number_of_input_rows} rows\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "0c69ee56-172f-413b-a335-d15482fda55e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 3: Generate Embeddings\n",
+    "\n",
+    "We will first generate embeddings using NVIDIA TensorRT optimized SentenceTransformer. In the demo you can use two fifferent HF models: intfloat/e5-large-v2 or sentence-transformers/all-MiniLM-L6-v2\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "8d7bd9db-79a1-4d46-a849-ac49c3de7b49",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# To create embedder with different models, uncomment the following line\n",
+    "# embedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n",
+    "embedder = HuggingFaceSentenceEmbedder(\n",
+    "    modelName=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
+    "    inputCol=\"data\",\n",
+    "    outputCol=\"embeddings\",\n",
+    "    runtime=\"tensorrt\",\n",
+    ")\n",
+    "\n",
+    "embeddings = embedder.transform(df).select(\"id\", \"embeddings\").cache()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6885033f-6eea-4338-a632-2837582d91a1",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 4: Build the query against embeddings\n",
+    "\n",
+    "Get query embeddings running standard SentenceTransformer just on the driver. Convert embedding results to a data frame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "23b83621-3f42-42ff-847e-97a4af2d3276",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Sample query\n",
+    "queries = [\"desserts\", \"disgusting\"]\n",
+    "ids = [1, 2]\n",
+    "\n",
+    "# Create DataFrame directly from the data and schema\n",
+    "query_df = spark.createDataFrame(\n",
+    "    list(zip(ids, queries)),\n",
+    "    StructType(\n",
+    "        [\n",
+    "            StructField(\"id\", IntegerType(), nullable=False),\n",
+    "            StructField(\"data\", StringType(), nullable=False),\n",
+    "        ]\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "query_embeddings = embedder.transform(query_df).select(\"id\", \"embeddings\").cache()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "0154ce06-5875-4236-8178-030d45091445",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 5: Build a fast vector index to over review embeddings\n",
+    "\n",
+    "We will use fast NVIDIA Rapids indexer. This KNN implementation will work only on GPU. If you want to use CPU then switch to synapse.ml.nn CPU based KNN implementation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "c01d2c1e-837b-4525-a4d3-4938fd4221fb",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "RUN_ON_GPU = torch.cuda.is_available()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "aa0e4178-75e4-412b-940e-25d55b7396ce",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "if RUN_ON_GPU:\n",
+    "    rapids_knn_model = (\n",
+    "        ApproximateNearestNeighbors(k=5)\n",
+    "        .setInputCol(\"embeddings\")\n",
+    "        .setIdCol(\"id\")\n",
+    "        .fit(embeddings)\n",
+    "    )\n",
+    "else:\n",
+    "    array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n",
+    "    df_with_vectors = embeddings.withColumn(\n",
+    "        \"features\", array_to_vector_udf(embeddings[\"embeddings\"])\n",
+    "    )\n",
+    "    knn = (\n",
+    "        KNN()\n",
+    "        .setFeaturesCol(\"features\")\n",
+    "        .setValuesCol(\"id\")\n",
+    "        .setOutputCol(\"output\")\n",
+    "        .setK(10)\n",
+    "    )\n",
+    "    knn_model = knn.fit(df_with_vectors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 6: Find top k Nearest Neighbors ON GPU\n",
+    "\n",
+    "We will use fast ANN [IVFFlat algorithm](https://developer.nvidia.com/blog/accelerated-vector-search-approximating-with-rapids-raft-ivf-flat/) from Rapids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "0fb3f3d7-bbb6-4105-bb86-b08fabba4ca4",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "if RUN_ON_GPU:\n",
+    "    (_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)\n",
+    "else:\n",
+    "    array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n",
+    "    df_with_vectors = query_embeddings.withColumn(\n",
+    "        \"features\", array_to_vector_udf(query_embeddings[\"embeddings\"])\n",
+    "    )\n",
+    "    knn_df = knn_model.transform(df_with_vectors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Step 7: Collect and display results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "2ec6847c-3592-4645-aca1-0fc6d9e3ed0f",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "if RUN_ON_GPU:\n",
+    "    result_df = (\n",
+    "        knn_df.withColumn(\n",
+    "            \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n",
+    "        )\n",
+    "        .select(\n",
+    "            F.col(\"query_id\"),\n",
+    "            F.col(\"zipped.indices\").alias(\"id\"),\n",
+    "            F.col(\"zipped.distances\").alias(\"distance\"),\n",
+    "        )\n",
+    "        .join(df, on=\"id\", how=\"inner\")\n",
+    "        .select(\"query_id\", \"id\", \"data\", \"distance\")\n",
+    "    )\n",
+    "else:\n",
+    "    knn_df = knn_df.withColumnRenamed(\"data\", \"original_data\")\n",
+    "    result_df = (\n",
+    "        knn_df.withColumn(\"match\", F.explode(\"output\"))\n",
+    "        .join(df, df[\"id\"] == F.col(\"match.value\"))\n",
+    "        .select(\"original_data\", F.col(\"data\"), \"match.distance\")\n",
+    "    )\n",
+    "\n",
+    "display(result_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Results\n",
+    "\n",
+    "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN. The new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n",
+    "\n",
+    "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n",
+    "\n",
+    "![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png)\n",
+    "\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "environmentMetadata": null,
+   "language": "python",
+   "notebookMetadata": {
+    "mostRecentlyExecutedCommandWithImplicitDF": {
+     "commandId": -1,
+     "dataframes": [
+      "_sqldf"
+     ]
+    },
+    "pythonIndentUnit": 2,
+    "widgetLayout": []
+   },
+   "notebookName": "Quickstart - Custom Embeddings and Approximate KNN on GPU",
+   "widgets": {}
+  },
+  "kernel_info": {
+   "name": "synapse_pyspark"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "save_output": true,
+  "synapse_widget": {
+   "state": {
+    "4bd0e60b-98ae-4bfe-98ee-6f0399ceb456": {
+     "persist_state": {
+      "view": {
+       "chartOptions": {
+        "aggregationType": "count",
+        "categoryFieldKeys": [
+         "0"
+        ],
+        "chartType": "bar",
+        "isStacked": false,
+        "seriesFieldKeys": [
+         "0"
+        ]
+       },
+       "tableOptions": {},
+       "type": "details"
+      }
+     },
+     "sync_state": {
+      "isSummary": false,
+      "language": "scala",
+      "table": {
+       "rows": [
+        {
+         "0": "Once upon a time",
+         "1": [
+          " there was a girl who had a dream of becoming a writer.\n\nShe started writing short stories"
+         ]
+        },
+        {
+         "0": "Hello my name is",
+         "1": [
+          "***** and I have a question about my cat\n\nHello, thank you for bringing your question to"
+         ]
+        },
+        {
+         "0": "The best code is code thats",
+         "1": [
+          " not there\n\nCommenting your code is important. Not only does it help you remember what you"
+         ]
+        }
+       ],
+       "schema": [
+        {
+         "key": "0",
+         "name": "prompt",
+         "type": "string"
+        },
+        {
+         "key": "1",
+         "name": "text",
+         "type": "ArrayType(StringType,true)"
+        }
+       ],
+       "truncated": false
+      }
+     },
+     "type": "Synapse.DataFrame"
+    }
+   },
+   "version": "0.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/environment.yml b/environment.yml
index 5302760212..477edb9c84 100644
--- a/environment.yml
+++ b/environment.yml
@@ -51,3 +51,4 @@ dependencies:
     - markdownify
     - traitlets
     - opencv-python
+    - sentence_transformers~=2.2.2
diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh
index dd702d3e4b..bcb8fdc93e 100644
--- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh
+++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh
@@ -1,11 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #!/bin/bash
-# set portion of path below after /dbfs/ to dbfs zip file location
-SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
 # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10
 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
 # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
-RAPIDS_VERSION=23.10.0
+RAPIDS_VERSION=24.4.0
 SPARK_RAPIDS_VERSION=23.10.0
+SPARK_RAPIDSML_VERSION=24.6.0
 
 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
 
@@ -28,6 +41,8 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda
     rmm-cu11~=${RAPIDS_VERSION} \
     --extra-index-url=https://pypi.nvidia.com
 
-# install spark-rapids-ml
-/databricks/python/bin/pip install spark-rapids-ml
+# install model navigator
+/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com onnxruntime-gpu "tensorrt" "triton-model-navigator==0.10.1" "sentence_transformers~=2.2.2" "urllib3<2" 
 
+# install spark-rapids-ml
+/databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION}
diff --git a/website/sidebars.js b/website/sidebars.js
index 0ab11ee7ed..5ef56a0f78 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -52,6 +52,7 @@ module.exports = {
                         "Explore Algorithms/OpenAI/OpenAI",
                         "Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding",
                         "Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN",
+                        "Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU",
                         "Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms",
                     ],
                 },