diff --git a/.acrolinx-config.edn b/.acrolinx-config.edn
index 919922c789..2020cbb0f8 100644
--- a/.acrolinx-config.edn
+++ b/.acrolinx-config.edn
@@ -1,2 +1,2 @@
{:allowed-branchname-matches ["master" "release-.*"]
- :allowed-filename-matches ["notebooks" "website"]}
+ :allowed-filename-matches ["docs" "website"]}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a1188611b9..3fa7eb52e1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -52,7 +52,7 @@ this process:
#### Implement documentation
-- Add a [sample Jupyter notebook](notebooks/) that shows the intended use
+- Add a [sample Jupyter notebook](docs/) that shows the intended use
case of your algorithm, with instructions in step-by-step manner. (The same
notebook could be used for testing the code.)
- Add in-line ScalaDoc comments to your source code, to generate the [API
diff --git a/build.sbt b/build.sbt
index c55dfd3939..2e8d49d85f 100644
--- a/build.sbt
+++ b/build.sbt
@@ -381,11 +381,11 @@ publishBadges := {
uploadBadge("master version", version.value, "blue", "master_version3.svg")
}
-val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload notebooks to blob storage")
+val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload docs to blob storage")
uploadNotebooks := {
- val localNotebooksFolder = join(baseDirectory.value.toString, "notebooks").toString
+ val localNotebooksFolder = join(baseDirectory.value.toString, "docs").toString
val blobNotebooksFolder = version.value
- uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "notebooks")
+ uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "docs")
}
val settings = Seq(
@@ -493,8 +493,8 @@ setupTask := {
val convertNotebooks = TaskKey[Unit]("convertNotebooks", "convert notebooks to markdown for website display")
convertNotebooks := {
- runCmdStr("python -m docs.python.documentprojection " +
- "--customchannels docs/python/synapseml_channels -c website . docs/manifest.yaml -p")
+ runCmd(Seq("pip", "install", "-e", "."), wd=join(baseDirectory.value, "tools/docgen"))
+ runCmd(Seq("python", "__main__.py"), wd=join(baseDirectory.value, "tools/docgen/docgen"))
}
val testWebsiteDocs = TaskKey[Unit]("testWebsiteDocs",
diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
index 7d82f1fc15..baecf3d8bd 100644
--- a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
+++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
@@ -68,6 +68,13 @@ object FileUtilities {
()
}
+ def copyAndRenameFile(from: File, toDir: File, newName: String, overwrite: Boolean = false): Unit = {
+ Files.copy(from.toPath, new File(toDir, newName).toPath,
+ (if (overwrite) Seq(StandardCopyOption.REPLACE_EXISTING)
+ else Seq()): _*)
+ ()
+ }
+
// Perhaps this should move into a more specific place, not a generic file utils thing
def zipFolder(dir: File, out: File): Unit = {
import java.io.{BufferedInputStream, FileInputStream, FileOutputStream}
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
index 1ac4872632..d008c837ba 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
@@ -86,17 +86,17 @@ object DatabricksUtilities {
// Execution Params
val TimeoutInMillis: Int = 40 * 60 * 1000
- val NotebookFiles: Array[File] = FileUtilities.recursiveListFiles(
- FileUtilities.join(
- BuildInfo.baseDirectory.getParent, "notebooks", "features").getCanonicalFile)
+ val DocsDir = FileUtilities.join(BuildInfo.baseDirectory.getParent, "docs").getCanonicalFile()
+ val NotebookFiles: Array[File] = FileUtilities.recursiveListFiles(DocsDir)
+ .filter(_.toString.endsWith(".ipynb"))
val ParallelizableNotebooks: Seq[File] = NotebookFiles.filterNot(_.isDirectory)
val CPUNotebooks: Seq[File] = ParallelizableNotebooks
- .filterNot(_.getAbsolutePath.contains("simple_deep_learning"))
+ .filterNot(_.getAbsolutePath.contains("Fine-tune"))
.filterNot(_.getAbsolutePath.contains("Explanation Dashboard")) // TODO Remove this exclusion
- val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("simple_deep_learning"))
+ val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("Fine-tune"))
def databricksGet(path: String): JsValue = {
val request = new HttpGet(BaseURL + path)
@@ -336,13 +336,15 @@ object DatabricksUtilities {
//scalastyle:on cyclomatic.complexity
def uploadAndSubmitNotebook(clusterId: String, notebookFile: File): DatabricksNotebookRun = {
- val destination: String = Folder + "/" + notebookFile.getName
+ val dirPaths = DocsDir.toURI.relativize(notebookFile.getParentFile.toURI).getPath
+ val folderToCreate = Folder + "/" + dirPaths
+ println(s"Creating folder $folderToCreate")
+ workspaceMkDir(folderToCreate)
+ val destination: String = folderToCreate + notebookFile.getName
uploadNotebook(notebookFile, destination)
val runId: Int = submitRun(clusterId, destination)
val run: DatabricksNotebookRun = DatabricksNotebookRun(runId, notebookFile.getName)
-
println(s"Successfully submitted job run id ${run.runId} for notebook ${run.notebookName}")
-
run
}
@@ -413,9 +415,6 @@ abstract class DatabricksTestHelper extends TestBase {
assert(areLibrariesInstalled(clusterId))
}
- println(s"Creating folder $Folder")
- workspaceMkDir(Folder)
-
println(s"Submitting jobs")
val parNotebookRuns: Seq[DatabricksNotebookRun] = notebooks.map(uploadAndSubmitNotebook(clusterId, _))
parNotebookRuns.foreach(notebookRun => jobIdsToCancel.append(notebookRun.runId))
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SharedNotebookE2ETestUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SharedNotebookE2ETestUtilities.scala
index 651b74916b..df72acab95 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SharedNotebookE2ETestUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SharedNotebookE2ETestUtilities.scala
@@ -4,34 +4,76 @@
package com.microsoft.azure.synapse.ml.nbtest
import com.microsoft.azure.synapse.ml.build.BuildInfo
-import com.microsoft.azure.synapse.ml.core.env.FileUtilities
+import com.microsoft.azure.synapse.ml.core.env.{FileUtilities, StreamUtilities}
import org.apache.commons.io.FileUtils
import java.io.File
import java.lang.ProcessBuilder.Redirect
import scala.sys.process._
-
+import scala.io.Source
+import java.io.{BufferedWriter, FileWriter}
object SharedNotebookE2ETestUtilities {
val ResourcesDirectory = new File(getClass.getResource("/").toURI)
val NotebooksDir = new File(ResourcesDirectory, "generated-notebooks")
+ val NotebookPreamble: String =
+ """
+ |# In[ ]:
+ |
+ |
+      |# This cell ensures magic commands like '%pip install' work on synapse scheduled spark jobs
+ |from synapse.ml.core.platform import running_on_synapse
+ |
+ |if running_on_synapse():
+ | from IPython import get_ipython
+ | from IPython.terminal.interactiveshell import TerminalInteractiveShell
+ | from synapse.ml.core.platform import materializing_display as display
+ | from pyspark.sql import SparkSession
+ |
+ | spark = SparkSession.builder.getOrCreate()
+ | try:
+ | shell = TerminalInteractiveShell.instance()
+ | except:
+ | pass
+ |
+ |""".stripMargin
+
+ def insertTextInFile(file: File, textToPrepend: String, locToInsert: Int): Unit = {
+ val existingLines = StreamUtilities.using(Source.fromFile(file)) { s =>
+ s.getLines().toList
+ }.get
+ val linesBefore = existingLines.take(locToInsert)
+ val linesAfter = existingLines.takeRight(existingLines.length - locToInsert)
+ val linesInMiddle = textToPrepend.split("\n")
+ val newText = (linesBefore ++ linesInMiddle ++ linesAfter).mkString("\n")
+ StreamUtilities.using(new BufferedWriter(new FileWriter(file))) { writer =>
+ writer.write(newText)
+ }
+ }
def generateNotebooks(): Unit = {
cleanUpGeneratedNotebooksDir()
- FileUtilities.recursiveListFiles(FileUtilities
- .join(BuildInfo.baseDirectory.getParent, "notebooks/features")
- .getCanonicalFile)
+ val docsDir = FileUtilities.join(BuildInfo.baseDirectory.getParent, "docs").getCanonicalFile
+ val newFiles = FileUtilities.recursiveListFiles(docsDir)
.filter(_.getName.endsWith(".ipynb"))
.map { f =>
- FileUtilities.copyFile(f, NotebooksDir, true)
- val newFile = new File(NotebooksDir, f.getName)
- val targetName = new File(NotebooksDir, f.getName.replace(" ", "").replace("-", ""))
- newFile.renameTo(targetName)
- targetName
+ val relative = docsDir.toURI.relativize(f.toURI).getPath
+ val newName = relative
+ .replace("/", "")
+ .replace(" ", "")
+ .replace("-", "")
+ .replace(",", "")
+ FileUtilities.copyAndRenameFile(f, NotebooksDir, newName, true)
+ new File(NotebooksDir, newName)
}
runCmd(activateCondaEnv ++ Seq("jupyter", "nbconvert", "--to", "python", "*.ipynb"), NotebooksDir)
+
+ newFiles.foreach { f =>
+ insertTextInFile(new File(f.getPath.replace(".ipynb", ".py")), NotebookPreamble, 2)
+ }
+
}
def cleanUpGeneratedNotebooksDir(): Unit = {
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
index 5dfd76c4ae..d7916b8eaf 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
@@ -44,11 +44,12 @@ class SynapseTests extends TestBase {
val selectedPythonFiles: Array[File] = FileUtilities.recursiveListFiles(SharedNotebookE2ETestUtilities.NotebooksDir)
.filter(_.getAbsolutePath.endsWith(".py"))
- .filterNot(_.getAbsolutePath.contains("DeepLearningDeepTextClassification")) // Excluded by design task 1829306
- .filterNot(_.getAbsolutePath.contains("DeepLearningDeepVisionClassification")) // Excluded by design task 1829306
- .filterNot(_.getAbsolutePath.contains("VowpalWabbitClassificationusingVWnativeFormat"))
+ .filterNot(_.getAbsolutePath.contains("Finetune")) // Excluded by design task 1829306
+ .filterNot(_.getAbsolutePath.contains("VWnativeFormat"))
.filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synpase fix
.filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synpase fix
+ .filterNot(_.getAbsolutePath.contains("SetupCognitive")) // No code to run
+ .filterNot(_.getAbsolutePath.contains("CreateaSparkCluster")) // No code to run
.sortBy(_.getAbsolutePath)
val expectedPoolCount: Int = selectedPythonFiles.length
diff --git a/website/versioned_docs/version-0.10.1/features/spark_serving/about.md b/docs/Deploy Models/Overview.md
similarity index 99%
rename from website/versioned_docs/version-0.10.1/features/spark_serving/about.md
rename to docs/Deploy Models/Overview.md
index 1aaeadde49..4d0f54ea18 100644
--- a/website/versioned_docs/version-0.10.1/features/spark_serving/about.md
+++ b/docs/Deploy Models/Overview.md
@@ -33,7 +33,7 @@ sidebar_label: About
### Jupyter Notebook Examples
-- [Deploy a classifier trained on the Adult Census Dataset](../SparkServing%20-%20Deploying%20a%20Classifier)
+- [Deploy a classifier trained on the Adult Census Dataset](../Quickstart%20-%20Deploying%20a%20Classifier)
- More coming soon!
### Spark Serving Hello World
diff --git a/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb b/docs/Deploy Models/Quickstart - Deploying a Classifier.ipynb
similarity index 92%
rename from notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb
rename to docs/Deploy Models/Quickstart - Deploying a Classifier.ipynb
index 92094e7743..c2a6a7dd81 100644
--- a/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb
+++ b/docs/Deploy Models/Quickstart - Deploying a Classifier.ipynb
@@ -9,31 +9,6 @@
"First, we import needed packages:"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import sys\n",
- "import numpy as np\n",
- "import pandas as pd"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Advanced Usage Async, Batching, and Multi-Key.ipynb b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
similarity index 93%
rename from notebooks/features/cognitive_services/CognitiveServices - Advanced Usage Async, Batching, and Multi-Key.ipynb
rename to docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
index 6f7d8b12d4..e80179519b 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Advanced Usage Async, Batching, and Multi-Key.ipynb
+++ b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
@@ -33,14 +33,7 @@
{
"cell_type": "code",
"source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import running_on_synapse, find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
+ "from synapse.ml.core.platform import find_secret\n",
"\n",
"service_key = find_secret(\"cognitive-api-key\")\n",
"service_loc = \"eastus\""
@@ -264,7 +257,8 @@
{
"cell_type": "markdown",
"source": [
- "#### Faster without extra hardware:\n"
+ "#### Faster without extra hardware:\n",
+ ""
],
"metadata": {
"application/vnd.databricks.v1+cell": {
@@ -398,7 +392,9 @@
{
"cell_type": "markdown",
"source": [
- "## Learn More\n- [Explore other cogntive services](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20Overview/)\n- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)"
+ "## Learn More\n",
+    "- [Explore other cognitive services](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20Overview/)\n",
+ "- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)"
],
"metadata": {
"application/vnd.databricks.v1+cell": {
@@ -421,8 +417,13 @@
"language": "python",
"widgets": {},
"notebookOrigID": 3743502060540796
+ },
+ "kernelspec": {
+ "name": "python3",
+ "language": "python",
+ "display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb b/docs/Explore Algorithms/AI Services/Geospatial Services.ipynb
similarity index 96%
rename from notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb
rename to docs/Explore Algorithms/AI Services/Geospatial Services.ipynb
index 108c54c53a..b3159957b1 100644
--- a/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb
+++ b/docs/Explore Algorithms/AI Services/Geospatial Services.ipynb
@@ -53,12 +53,8 @@
"metadata": {},
"outputs": [],
"source": [
- "from pyspark.sql.functions import udf, col\n",
"from pyspark.sql.types import StructType, StructField, DoubleType\n",
- "from pyspark.sql.functions import lit\n",
- "from pyspark.ml import PipelineModel\n",
"from pyspark.sql.functions import col\n",
- "import os\n",
"import requests\n",
"from requests.adapters import HTTPAdapter\n",
"from requests.packages.urllib3.util.retry import Retry\n",
@@ -75,21 +71,6 @@
"http.mount(\"http://\", adapter)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -98,6 +79,7 @@
"source": [
"from synapse.ml.cognitive import *\n",
"from synapse.ml.geospatial import *\n",
+ "from synapse.ml.core.platform import *\n",
"\n",
"# An Azure Maps account key\n",
"maps_key = find_secret(\"azuremaps-api-key\")"
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb b/docs/Explore Algorithms/AI Services/Multivariate Anomaly Detection.ipynb
similarity index 98%
rename from notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb
rename to docs/Explore Algorithms/AI Services/Multivariate Anomaly Detection.ipynb
index 80e7c4199a..f6c97f49e1 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb
+++ b/docs/Explore Algorithms/AI Services/Multivariate Anomaly Detection.ipynb
@@ -38,20 +38,6 @@
"Let's start by setting up the environment variables for our service keys. The next cell sets the `ANOMALY_API_KEY` and the `BLOB_CONNECTION_STRING` environment variables based on the values stored in our Azure Key Vault. If you're running this tutorial in your own environment, make sure you set these environment variables before you proceed."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -65,6 +51,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "from synapse.ml.core.platform import find_secret\n",
+ "\n",
"# An Anomaly Dectector subscription key\n",
"anomalyKey = find_secret(\"anomaly-api-key\") # use your own anomaly api key\n",
"# Your storage account name\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb b/docs/Explore Algorithms/AI Services/Overview.ipynb
similarity index 98%
rename from notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb
rename to docs/Explore Algorithms/AI Services/Overview.ipynb
index 6261a360dd..0a95ce0dcf 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb
+++ b/docs/Explore Algorithms/AI Services/Overview.ipynb
@@ -131,23 +131,7 @@
"from requests import Request\n",
"from pyspark.sql.functions import lit\n",
"from pyspark.ml import PipelineModel\n",
- "from pyspark.sql.functions import col\n",
- "import os"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "from pyspark.sql.functions import col"
]
},
{
@@ -157,6 +141,7 @@
"outputs": [],
"source": [
"from synapse.ml.cognitive import *\n",
+ "from synapse.ml.core.platform import *\n",
"\n",
"# A general Cognitive Services key for Text Analytics, Computer Vision and Form Recognizer (or use separate keys that belong to each service)\n",
"service_key = find_secret(\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb
similarity index 98%
rename from notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb
index 9a7dbf6a2f..5ecb192f2b 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb
@@ -26,11 +26,8 @@
"from pyspark.ml import PipelineModel\n",
"from pyspark.sql.functions import col, udf\n",
"from pyspark.ml.feature import SQLTransformer\n",
- "from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import find_secret\n",
"\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
"# put your service keys here\n",
"cognitive_key = find_secret(\"cognitive-api-key\")\n",
"cognitive_location = \"eastus\"\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Text.ipynb
similarity index 86%
rename from notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Analyze Text.ipynb
index 8132ca9291..4a4ad4c7c3 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Text.ipynb
@@ -14,14 +14,7 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import running_on_synapse, find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
+ "from synapse.ml.core.platform import find_secret\n",
"\n",
"cognitive_key = find_secret(\"cognitive-api-key\")\n",
"cognitive_location = \"eastus\""
@@ -105,4 +98,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Custom Search for Art.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.ipynb
similarity index 88%
rename from notebooks/features/cognitive_services/CognitiveServices - Custom Search for Art.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.ipynb
index 9255985c58..4b3a72a2dc 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Custom Search for Art.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.ipynb
@@ -16,38 +16,12 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"source": [
"import os, sys, time, json, requests\n",
- "from pyspark.ml import Transformer, Estimator, Pipeline\n",
- "from pyspark.ml.feature import SQLTransformer\n",
- "from pyspark.sql.functions import lit, udf, col, split"
- ],
- "outputs": [],
- "metadata": {
- "collapsed": true
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
+ "from pyspark.sql.functions import lit, udf, col, split\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "source": [
"cognitive_key = find_secret(\"cognitive-api-key\")\n",
"cognitive_loc = \"eastus\"\n",
"azure_search_key = find_secret(\"azure-search-key\")\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Create Audiobooks.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb
similarity index 97%
rename from notebooks/features/cognitive_services/CognitiveServices - Create Audiobooks.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb
index a763d8b84a..0c79a4eae4 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Create Audiobooks.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb
@@ -33,14 +33,10 @@
{
"cell_type": "code",
"source": [
- "from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
"if running_on_synapse():\n",
" from notebookutils import mssparkutils\n",
- " from notebookutils.visualization import display\n",
"\n",
"# Fill this in with your cognitive service information\n",
"service_key = find_secret(\n",
diff --git a/notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Flooding Risk.ipynb
similarity index 96%
rename from notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Flooding Risk.ipynb
index 6ee75fc2dd..a2dc21ff28 100644
--- a/notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Flooding Risk.ipynb
@@ -29,7 +29,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
"import json\n",
"import time\n",
"import requests\n",
@@ -45,15 +44,7 @@
"adapter = HTTPAdapter(max_retries=retry_strategy)\n",
"http = requests.Session()\n",
"http.mount(\"https://\", adapter)\n",
- "http.mount(\"http://\", adapter)\n",
- "\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "http.mount(\"http://\", adapter)"
]
},
{
@@ -62,6 +53,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "from synapse.ml.core.platform import *\n",
+ "\n",
"# Azure Maps account key\n",
"maps_key = find_secret(\"azuremaps-api-key\") # Replace this with your azure maps key\n",
"\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.ipynb
similarity index 82%
rename from notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb
rename to docs/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.ipynb
index daaf26720a..0611541843 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.ipynb
@@ -44,10 +44,6 @@
"source": [
"import os\n",
"from synapse.ml.core.platform import find_secret\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
"\n",
"service_key = find_secret(\"anomaly-api-key\") # Paste your anomaly detector key here\n",
"location = \"westus2\" # Paste your anomaly detector location here"
@@ -59,7 +55,9 @@
{
"cell_type": "markdown",
"source": [
- "## Read data into a DataFrame\n\nNext, let's read the IoTSignals file into a DataFrame. Open a new notebook in your Synapse workspace and create a DataFrame from the file."
+ "## Read data into a DataFrame\n",
+ "\n",
+ "Next, let's read the IoTSignals file into a DataFrame. Open a new notebook in your Synapse workspace and create a DataFrame from the file."
],
"metadata": {}
},
@@ -79,7 +77,9 @@
{
"cell_type": "markdown",
"source": [
- "### Run anomaly detection using Cognitive Services on Spark\n\nThe goal is to find instances where the signals from the IoT devices were outputting anomalous values so that we can see when something is going wrong and do predictive maintenance. To do that, let's use Anomaly Detector on Spark:"
+ "### Run anomaly detection using Cognitive Services on Spark\n",
+ "\n",
+ "The goal is to find instances where the signals from the IoT devices were outputting anomalous values so that we can see when something is going wrong and do predictive maintenance. To do that, let's use Anomaly Detector on Spark:"
],
"metadata": {}
},
@@ -133,14 +133,22 @@
{
"cell_type": "markdown",
"source": [
- "This cell should yield a result that looks like:\n\n| timestamp | value | deviceId | isAnomaly |\n|:--------------------|--------:|:-----------|:------------|\n| 2020-05-01 18:33:51 | 3174 | dev-7 | False |\n| 2020-05-01 18:33:52 | 2976 | dev-7 | False |\n| 2020-05-01 18:33:53 | 2714 | dev-7 | False |"
+ "This cell should yield a result that looks like:\n",
+ "\n",
+ "| timestamp | value | deviceId | isAnomaly |\n",
+ "|:--------------------|--------:|:-----------|:------------|\n",
+ "| 2020-05-01 18:33:51 | 3174 | dev-7 | False |\n",
+ "| 2020-05-01 18:33:52 | 2976 | dev-7 | False |\n",
+ "| 2020-05-01 18:33:53 | 2714 | dev-7 | False |"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
- "## Visualize anomalies for one of the devices\n\nIoTSignals.csv has signals from multiple IoT devices. We'll focus on a specific device and visualize anomalous outputs from the device."
+ "## Visualize anomalies for one of the devices\n",
+ "\n",
+ "IoTSignals.csv has signals from multiple IoT devices. We'll focus on a specific device and visualize anomalous outputs from the device."
],
"metadata": {}
},
@@ -229,7 +237,13 @@
{
"cell_type": "markdown",
"source": [
- "If successful, your output will look like this:\n\n![Anomaly Detector Plot](https://github.com/MicrosoftDocs/azure-docs/raw/master/articles/cognitive-services/big-data/media/anomaly-output.png)\n\n## Next steps\n\nLearn how to do predictive maintenance at scale with Azure Cognitive Services, Azure Synapse Analytics, and Azure CosmosDB. For more information, see the full sample on [GitHub](https://github.com/Azure-Samples/cosmosdb-synapse-link-samples)."
+ "If successful, your output will look like this:\n",
+ "\n",
+ "![Anomaly Detector Plot](https://github.com/MicrosoftDocs/azure-docs/raw/master/articles/cognitive-services/big-data/media/anomaly-output.png)\n",
+ "\n",
+ "## Next steps\n",
+ "\n",
+ "Learn how to do predictive maintenance at scale with Azure Cognitive Services, Azure Synapse Analytics, and Azure CosmosDB. For more information, see the full sample on [GitHub](https://github.com/Azure-Samples/cosmosdb-synapse-link-samples)."
],
"metadata": {}
}
diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
similarity index 97%
rename from notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb
rename to docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
index 41c7dd1f3e..58e719284e 100644
--- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb
+++ b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
@@ -32,11 +32,7 @@
"execution_count": null,
"outputs": [],
"source": [
- "import subprocess\n",
- "import sys\n",
- "\n",
- "for package in [\"sqlparse\", \"raiwidgets\", \"interpret-community\"]:\n",
- " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])"
+ "%pip install sqlparse raiwidgets interpret-community"
],
"metadata": {
"collapsed": false,
@@ -76,7 +72,6 @@
"import mlflow\n",
"\n",
"from pyspark.sql import functions as F\n",
- "from pyspark.sql import SparkSession\n",
"from pyspark.ml.feature import VectorAssembler\n",
"from pyspark.sql.types import *\n",
"from pyspark.ml import Pipeline\n",
@@ -161,29 +156,9 @@
"# MLFlow experiment\n",
"artifact_path = \"isolationforest\"\n",
"experiment_name = f\"/Shared/isolation_forest_experiment-{str(uuid.uuid1())}/\"\n",
- "model_name = f\"isolation-forest-model\"\n",
- "if running_on_synapse():\n",
- " from synapse.ml.core.platform import materializing_display as display\n",
- "\n",
- " # use regular display when running on interactive notebook\n",
- " # from notebookutils.visualization import display"
+ "model_name = f\"isolation-forest-model\""
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
{
"cell_type": "markdown",
"metadata": {
@@ -1030,4 +1005,4 @@
},
"nbformat": 4,
"nbformat_minor": 1
-}
\ No newline at end of file
+}
diff --git a/website/docs/features/causal_inference/about.md b/docs/Explore Algorithms/Causal Inference/Overview.md
similarity index 97%
rename from website/docs/features/causal_inference/about.md
rename to docs/Explore Algorithms/Causal Inference/Overview.md
index a4664dba24..2d5384d4de 100644
--- a/website/docs/features/causal_inference/about.md
+++ b/docs/Explore Algorithms/Causal Inference/Overview.md
@@ -1,7 +1,7 @@
---
-title: Causal Inference
+title: Overview
hide_title: true
-sidebar_label: About
+sidebar_label: Overview
---
## Causal Inference on Apache Spark
@@ -58,4 +58,4 @@ dmlModel.getConfidenceInterval()
```
For an end to end application, check out the DoubleMLEstimator [notebook
-example](../Effects%20of%20Outreach%20Efforts).
+example](../Quickstart%20-%20Measure%20Causal%20Effects).
diff --git a/notebooks/features/causal_inference/Effects of Outreach Efforts.ipynb b/docs/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.ipynb
similarity index 97%
rename from notebooks/features/causal_inference/Effects of Outreach Efforts.ipynb
rename to docs/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.ipynb
index 8191154316..54393e22b5 100644
--- a/notebooks/features/causal_inference/Effects of Outreach Efforts.ipynb
+++ b/docs/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.ipynb
@@ -104,18 +104,6 @@
"| Revenue | Y | $ Revenue from customer given by the amount of software purchased |\n"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": 2,
diff --git a/notebooks/features/causal_inference/Heterogeneous Effects of Outreach Efforts.ipynb b/docs/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.ipynb
similarity index 97%
rename from notebooks/features/causal_inference/Heterogeneous Effects of Outreach Efforts.ipynb
rename to docs/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.ipynb
index 959ca8365a..d814880289 100644
--- a/notebooks/features/causal_inference/Heterogeneous Effects of Outreach Efforts.ipynb
+++ b/docs/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.ipynb
@@ -104,18 +104,6 @@
"| Revenue | Y | $ Revenue from customer given by the amount of software purchased |\n"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -331,4 +319,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/classification/Classification - Before and After SynapseML.ipynb b/docs/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.ipynb
similarity index 97%
rename from notebooks/features/classification/Classification - Before and After SynapseML.ipynb
rename to docs/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.ipynb
index 6983640f05..e55932c9f8 100644
--- a/notebooks/features/classification/Classification - Before and After SynapseML.ipynb
+++ b/docs/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.ipynb
@@ -45,18 +45,6 @@
"Import necessary Python libraries and get a spark session."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"attachments": {},
"cell_type": "markdown",
diff --git a/notebooks/features/classification/Classification - Adult Census.ipynb b/docs/Explore Algorithms/Classification/Quickstart - Train Classifier.ipynb
similarity index 88%
rename from notebooks/features/classification/Classification - Adult Census.ipynb
rename to docs/Explore Algorithms/Classification/Quickstart - Train Classifier.ipynb
index 04db727924..6408b13a52 100644
--- a/notebooks/features/classification/Classification - Adult Census.ipynb
+++ b/docs/Explore Algorithms/Classification/Quickstart - Train Classifier.ipynb
@@ -11,28 +11,6 @@
],
"metadata": {}
},
- {
- "cell_type": "code",
- "execution_count": null,
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "source": [
- "import numpy as np\n",
- "import pandas as pd"
- ],
- "outputs": [],
- "metadata": {}
- },
{
"cell_type": "markdown",
"source": [
diff --git a/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb b/docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.ipynb
similarity index 94%
rename from notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb
rename to docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.ipynb
index 4dab04ecaf..52b70dec61 100644
--- a/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb
+++ b/docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.ipynb
@@ -11,18 +11,6 @@
")."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb b/docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.ipynb
similarity index 96%
rename from notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb
rename to docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.ipynb
index 83bac1450c..e85b6e1dad 100644
--- a/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb
+++ b/docs/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.ipynb
@@ -16,14 +16,10 @@
"outputs": [],
"source": [
"import os\n",
- "import re\n",
"import urllib.request\n",
- "import numpy as np\n",
"import pandas as pd\n",
"from zipfile import ZipFile\n",
- "from bs4 import BeautifulSoup\n",
"from pyspark.sql.functions import udf, rand, when, col\n",
- "from pyspark.sql.types import StructType, StructField, DoubleType, StringType\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.feature import CountVectorizer, RegexTokenizer\n",
"from synapse.ml.vw import VowpalWabbitClassifier\n",
@@ -32,18 +28,6 @@
"import matplotlib.pyplot as plt"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": 3,
diff --git a/website/docs/features/simple_deep_learning/about.md b/docs/Explore Algorithms/Deep Learning/Distributed Training.md
similarity index 95%
rename from website/docs/features/simple_deep_learning/about.md
rename to docs/Explore Algorithms/Deep Learning/Distributed Training.md
index 4ede10d040..46b4bfa597 100644
--- a/website/docs/features/simple_deep_learning/about.md
+++ b/docs/Explore Algorithms/Deep Learning/Distributed Training.md
@@ -1,6 +1,6 @@
---
-title: Simple Deep Learning with SynapseML
-sidebar_label: About
+title: Distributed Training
+sidebar_label: Distributed Training
---
### Why Simple Deep Learning
@@ -72,5 +72,5 @@ pred_df = deep_vision_model.transform(test_df)
```
## Examples
-- [DeepLearning - Deep Vision Classification](../DeepLearning%20-%20Deep%20Vision%20Classification)
-- [DeepLearning - Deep Text Classification](../DeepLearning%20-%20Deep%20Text%20Classification)
+- [Quickstart - Fine-tune a Text Classifier](../Quickstart%20-%20Fine-tune%20a%20Text%20Classifier)
+- [Quickstart - Fine-tune a Vision Classifier](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier)
diff --git a/website/docs/features/simple_deep_learning/installation.md b/docs/Explore Algorithms/Deep Learning/Getting Started.md
similarity index 81%
rename from website/docs/features/simple_deep_learning/installation.md
rename to docs/Explore Algorithms/Deep Learning/Getting Started.md
index e373d1722c..20e33d6517 100644
--- a/website/docs/features/simple_deep_learning/installation.md
+++ b/docs/Explore Algorithms/Deep Learning/Getting Started.md
@@ -1,6 +1,6 @@
---
-title: Installation Guidance
-sidebar_label: Installation Guidance for Deep Vision Classification
+title: Getting Started
+sidebar_label: Getting Started
---
:::note
@@ -30,13 +30,13 @@ Coordinate: com.microsoft.azure:synapseml_2.12:0.11.2
Repository: https://mmlspark.azureedge.net/maven
```
:::note
-If you install the jar package, follow the first two cells of this [sample](./DeepLearning%20-%20Deep%20Vision%20Classification.md/#environment-setup----reinstall-horovod-based-on-new-version-of-pytorch)
+If you install the jar package, follow the first two cells of this [sample](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier#environment-setup----reinstall-horovod-based-on-new-version-of-pytorch)
to ensure horovod recognizes SynapseML.
:::
## 3. Try our sample notebook
-You could follow the rest of this [sample](./DeepLearning%20-%20Deep%20Vision%20Classification.md) and have a try on your own dataset.
+You could follow the rest of this [sample](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier) and try it on your own dataset.
Supported models (`backbone` parameter for `DeepVisionClassifer`) should be string format of [Torchvision-supported models](https://github.com/pytorch/vision/blob/v0.12.0/torchvision/models/__init__.py);
You could also check by running `backbone in torchvision.models.__dict__`.
diff --git a/website/versioned_docs/version-0.11.0/features/onnx/about.md b/docs/Explore Algorithms/Deep Learning/ONNX.md
similarity index 97%
rename from website/versioned_docs/version-0.11.0/features/onnx/about.md
rename to docs/Explore Algorithms/Deep Learning/ONNX.md
index baec0d8e6c..8f844caaba 100644
--- a/website/versioned_docs/version-0.11.0/features/onnx/about.md
+++ b/docs/Explore Algorithms/Deep Learning/ONNX.md
@@ -1,7 +1,7 @@
---
-title: ONNX model inferencing on Spark
+title: ONNX
hide_title: true
-sidebar_label: About
+sidebar_label: ONNX
description: Learn how to use the ONNX model transformer to run inference for an ONNX model on Spark.
---
@@ -104,5 +104,5 @@ The below example shows how to perform the slicing manually with a direct ONNXMo
## Example
-- [Interpretability - Image Explainers](../../responsible_ai/Interpretability%20-%20Image%20Explainers)
-- [ONNX - Inference on Spark](../ONNX%20-%20Inference%20on%20Spark)
+- [Image Explainers](../../Responsible%20AI/Image%20Explainers)
+- [Quickstart - ONNX Model Inference](../Quickstart%20-%20ONNX%20Model%20Inference)
diff --git a/notebooks/features/simple_deep_learning/DeepLearning - Deep Text Classification.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb
similarity index 94%
rename from notebooks/features/simple_deep_learning/DeepLearning - Deep Text Classification.ipynb
rename to docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb
index 5430efad48..f730d14b58 100644
--- a/notebooks/features/simple_deep_learning/DeepLearning - Deep Text Classification.ipynb
+++ b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb
@@ -16,34 +16,24 @@
},
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "910eee89-ded8-4c36-90ae-e9b8539c5773",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"### Environment Setup on databricks"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "60a84fca-38ae-48dc-826a-1cc2011c3977",
- "showTitle": false,
- "title": ""
- }
- },
"outputs": [],
"source": [
"# install cloudpickle 2.0.0 to add synapse module for usage of horovod\n",
"%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
diff --git a/notebooks/features/simple_deep_learning/DeepLearning - Deep Vision Classification.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb
similarity index 100%
rename from notebooks/features/simple_deep_learning/DeepLearning - Deep Vision Classification.ipynb
rename to docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb
diff --git a/notebooks/features/onnx/ONNX - Inference on Spark.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.ipynb
similarity index 91%
rename from notebooks/features/onnx/ONNX - Inference on Spark.ipynb
rename to docs/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.ipynb
index bc103e5a4a..866822e1fa 100644
--- a/notebooks/features/onnx/ONNX - Inference on Spark.ipynb
+++ b/docs/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.ipynb
@@ -1,9 +1,7 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"# ONNX Inference on Spark\n",
"\n",
@@ -13,42 +11,36 @@
"\n",
"- `onnxmltools==1.7.0`\n",
"- `lightgbm==3.2.1`\n"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"## Load the example data\n",
"\n",
"To load the example data, add the following code examples to cells in your notebook and then run the cells:"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
"outputs": [],
"source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
+ "%pip install lightgbm"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"df = (\n",
@@ -61,12 +53,13 @@
")\n",
"\n",
"display(df)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"The output should look similar to the following table, though the values and number of rows may differ:\n",
"\n",
@@ -75,20 +68,23 @@
"| 0.5641 | 1.0 | 0.0165 |\n",
"| 0.5702 | 1.0 | 0.0208 |\n",
"| 0.5673 | 1.0 | 0.0165 |"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"## Use LightGBM to train a model"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
@@ -118,34 +114,35 @@
")\n",
"\n",
"model = model.fit(train_data)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"## Convert the model to ONNX format\n",
"\n",
"The following code exports the trained model to a LightGBM booster and then converts it to ONNX format:"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "tags": [
- "hide-synapse-internal"
- ]
- },
"outputs": [],
"source": [
"from synapse.ml.core.platform import running_on_binder\n",
"\n",
"if running_on_binder():\n",
- " !pip install lightgbm==3.2.1\n",
" from IPython import get_ipython"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
diff --git a/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.ipynb
similarity index 92%
rename from notebooks/features/other/DeepLearning - Flower Image Classification.ipynb
rename to docs/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.ipynb
index 8d12174e04..d13f9b67df 100644
--- a/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb
+++ b/docs/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.ipynb
@@ -15,15 +15,7 @@
"source": [
"from pyspark.ml import Transformer, Estimator, Pipeline\n",
"from pyspark.ml.classification import LogisticRegression\n",
- "import sys, time\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import running_on_synapse, running_on_databricks\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "import sys, time"
]
},
{
@@ -111,7 +103,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### How does it work?\n\n![Convolutional network weights](http://i.stack.imgur.com/Hl2H6.png)"
+ "### How does it work?\n",
+ "\n",
+ "![Convolutional network weights](http://i.stack.imgur.com/Hl2H6.png)"
]
},
{
diff --git a/notebooks/features/hyperparameter_tuning/HyperOpt-SynapseML.ipynb b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
similarity index 95%
rename from notebooks/features/hyperparameter_tuning/HyperOpt-SynapseML.ipynb
rename to docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
index 50910ae895..6616908a00 100644
--- a/notebooks/features/hyperparameter_tuning/HyperOpt-SynapseML.ipynb
+++ b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
@@ -31,10 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# %pip install hyperopt\n",
- "import os\n",
- "\n",
- "os.system(\"pip install hyperopt\")"
+ "%pip install hyperopt mlflow"
]
},
{
@@ -47,17 +44,6 @@
"To track model training and tuning with MLflow, you could enable MLflow autologging by running `mlflow.pyspark.ml.autolog()`."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# version >= 1.28.0 supports reading logModelAllowlistFile from url\n",
- "# %pip install mlflow==1.29.0\n",
- "os.system(\"pip install mlflow==1.29.0\")"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -65,18 +51,11 @@
"outputs": [],
"source": [
"from synapse.ml.core.platform import *\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "spark = SparkSession.builder.getOrCreate()\n",
"\n",
"if running_on_synapse_internal():\n",
" experiment_name = \"hyperopt-synapseml\"\n",
"elif running_on_synapse():\n",
" experiment_name = \"hyperopt-synapseml\"\n",
- " # from notebookutils.visualization import display # use this display on interactive notebook\n",
- " from synapse.ml.core.platform import (\n",
- " materializing_display as display,\n",
- " ) # display for pipeline testing\n",
"else:\n",
" experiment_name = \"/Shared/hyperopt-synapseml\""
]
@@ -517,4 +496,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb b/docs/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.ipynb
similarity index 94%
rename from notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb
rename to docs/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.ipynb
index a4b178b1b0..a4e297f65d 100644
--- a/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb
+++ b/docs/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.ipynb
@@ -19,19 +19,6 @@
"Start by importing pandas and setting up our Spark session."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"attachments": {},
"cell_type": "markdown",
diff --git a/website/versioned_docs/version-0.11.2/features/lightgbm/about.md b/docs/Explore Algorithms/LightGBM/Overview.md
similarity index 99%
rename from website/versioned_docs/version-0.11.2/features/lightgbm/about.md
rename to docs/Explore Algorithms/LightGBM/Overview.md
index bed6b7b8ee..f5979e1072 100644
--- a/website/versioned_docs/version-0.11.2/features/lightgbm/about.md
+++ b/docs/Explore Algorithms/LightGBM/Overview.md
@@ -1,7 +1,7 @@
---
-title: LightGBM
+title: Overview
hide_title: true
-sidebar_label: About
+sidebar_label: Overview
---
# LightGBM on Apache Spark
@@ -55,7 +55,7 @@ model = LightGBMRegressor(application='quantile',
```
For an end to end application, check out the LightGBM [notebook
-example](../LightGBM%20-%20Overview).
+example](../Quickstart%20-%20Classification,%20Ranking,%20and%20Regression).
### Arguments/Parameters
diff --git a/notebooks/features/lightgbm/LightGBM - Overview.ipynb b/docs/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.ipynb
similarity index 97%
rename from notebooks/features/lightgbm/LightGBM - Overview.ipynb
rename to docs/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.ipynb
index 8d123e6453..580bf9373a 100644
--- a/notebooks/features/lightgbm/LightGBM - Overview.ipynb
+++ b/docs/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.ipynb
@@ -65,14 +65,7 @@
"metadata": {},
"outputs": [],
"source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "from synapse.ml.core.platform import *"
]
},
{
diff --git a/notebooks/features/cognitive_services/CognitiveServices - LangchainTransformer.ipynb b/docs/Explore Algorithms/OpenAI/Langchain.ipynb
similarity index 95%
rename from notebooks/features/cognitive_services/CognitiveServices - LangchainTransformer.ipynb
rename to docs/Explore Algorithms/OpenAI/Langchain.ipynb
index daeee703b0..1f22e8fa58 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - LangchainTransformer.ipynb
+++ b/docs/Explore Algorithms/OpenAI/Langchain.ipynb
@@ -71,37 +71,6 @@
"1. Connect your notebook to a cluster and follow along, editing and running the cells below."
]
},
- {
- "cell_type": "code",
- "execution_count": 0,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "cellMetadata": {
- "byteLimit": 2048000,
- "rowLimit": 10000
- },
- "inputWidgets": {},
- "nuid": "d0642e69-1669-4b18-94a2-258af0fbcf9f",
- "showTitle": false,
- "title": ""
- }
- },
- "outputs": [],
- "source": [
- "# This cell ensures make magic command like '%pip install' works on synapse scheduled spark jobs\n",
- "from synapse.ml.core.platform import running_on_synapse\n",
- "\n",
- "if running_on_synapse():\n",
- " from IPython import get_ipython\n",
- " from IPython.terminal.interactiveshell import TerminalInteractiveShell\n",
- "\n",
- " try:\n",
- " shell = TerminalInteractiveShell.instance()\n",
- " except:\n",
- " pass\n",
- " from notebookutils.visualization import display"
- ]
- },
{
"cell_type": "code",
"execution_count": 0,
@@ -146,7 +115,6 @@
"from langchain.document_loaders import OnlinePDFLoader\n",
"from langchain.tools.bing_search.tool import BingSearchRun, BingSearchAPIWrapper\n",
"from langchain.prompts import PromptTemplate\n",
- "import pyspark.sql.functions as f\n",
"from synapse.ml.cognitive.langchain import LangchainTransformer\n",
"from synapse.ml.core.platform import running_on_synapse, find_secret"
]
diff --git a/notebooks/features/cognitive_services/CognitiveServices - OpenAI.ipynb b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb
similarity index 98%
rename from notebooks/features/cognitive_services/CognitiveServices - OpenAI.ipynb
rename to docs/Explore Algorithms/OpenAI/OpenAI.ipynb
index 98173d1b25..2205407a04 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - OpenAI.ipynb
+++ b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb
@@ -75,15 +75,7 @@
},
"outputs": [],
"source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import running_on_synapse, find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
+ "from synapse.ml.core.platform import find_secret\n",
"\n",
"# Fill in the following lines with your service information\n",
"# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - OpenAI Embedding.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb
similarity index 98%
rename from notebooks/features/cognitive_services/CognitiveServices - OpenAI Embedding.ipynb
rename to docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb
index 5f40955952..db6ac26237 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - OpenAI Embedding.ipynb
+++ b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb
@@ -59,15 +59,7 @@
},
"outputs": [],
"source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import running_on_synapse, find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
+ "from synapse.ml.core.platform import find_secret\n",
"\n",
"# Fill in the following lines with your service information\n",
"# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model\n",
diff --git a/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
similarity index 97%
rename from notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb
rename to docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
index e1bb23cf6e..e397a382b8 100644
--- a/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb
+++ b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
@@ -48,20 +48,6 @@
"We start by importing packages and connecting to the Azure resources used in this workflow."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import running_on_synapse, find_secret\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -72,12 +58,7 @@
},
"outputs": [],
"source": [
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
- " import subprocess\n",
- " import sys\n",
- "\n",
- " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"openai\"])"
+ "%pip install openai"
]
},
{
@@ -97,6 +78,8 @@
},
"outputs": [],
"source": [
+ "from synapse.ml.core.platform import find_secret\n",
+ "\n",
"cognitive_key = find_secret(\n",
" \"cognitive-api-key\"\n",
") # Replace the call to find_secret with your key as a python string. e.g. cognitive_key=\"27snaiw...\"\n",
diff --git a/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb b/docs/Explore Algorithms/OpenCV/Image Transformations.ipynb
similarity index 95%
rename from notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb
rename to docs/Explore Algorithms/OpenCV/Image Transformations.ipynb
index 01e4a7178c..e7b2e799c6 100644
--- a/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb
+++ b/docs/Explore Algorithms/OpenCV/Image Transformations.ipynb
@@ -26,18 +26,6 @@
"metadata": {},
"outputs": [],
"source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import running_on_synapse\n",
- "\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
- "\n",
- "import synapse.ml\n",
- "import numpy as np\n",
"from synapse.ml.opencv import toNDArray\n",
"from synapse.ml.io import *\n",
"\n",
diff --git a/website/docs/reference/cyber.md b/docs/Explore Algorithms/Other Algorithms/Cyber ML.md
similarity index 100%
rename from website/docs/reference/cyber.md
rename to docs/Explore Algorithms/Other Algorithms/Cyber ML.md
diff --git a/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb b/docs/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.ipynb
similarity index 93%
rename from notebooks/features/other/CyberML - Anomalous Access Detection.ipynb
rename to docs/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.ipynb
index 308b62560c..2642d7a9c5 100644
--- a/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb
+++ b/docs/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.ipynb
@@ -30,7 +30,6 @@
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Create an Azure Databricks cluster and install the following libs\n",
"\n",
@@ -39,54 +38,57 @@
"Repository: https://mmlspark.azureedge.net/maven\n",
"\n",
"2. In Cluster Libraries install from PyPI the library called plotly"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Setup & Initialization"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
- "# this is used to produce the synthetic dataset for this test\n",
- "from synapse.ml.cyber.dataset import DataFactory\n",
- "\n",
- "# the access anomalies model generator\n",
- "from synapse.ml.cyber.anomaly.collaborative_filtering import AccessAnomaly\n",
- "\n",
- "from pyspark.sql import functions as f, types as t"
- ]
+ "%pip install plotly"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
+ "# this is used to produce the synthetic dataset for this test\n",
+ "from synapse.ml.cyber.dataset import DataFactory\n",
+ "from synapse.ml.cyber.anomaly.collaborative_filtering import AccessAnomaly\n",
+ "from pyspark.sql import functions as f"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Load up datasets"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"from synapse.ml.core.platform import running_on_databricks, running_on_synapse\n",
@@ -96,9 +98,6 @@
"else:\n",
" spark.sparkContext.setCheckpointDir(\"./tmp/checkpoint_path/\")\n",
"\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
- "\n",
"factory = DataFactory(\n",
" num_hr_users=25,\n",
" num_hr_resources=50,\n",
@@ -119,39 +118,47 @@
"outgroup_df = spark.createDataFrame(\n",
" factory.create_clustered_inter_test_data()\n",
").withColumn(\"tenant_id\", f.lit(0))"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"training_df.show()"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"print(training_df.count())\n",
"print(ingroup_df.count())\n",
"print(outgroup_df.count())"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Model setup & training"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"access_anomaly = AccessAnomaly(\n",
@@ -161,37 +168,45 @@
" likelihoodCol=\"likelihood\",\n",
" maxIter=1000,\n",
")"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"model = access_anomaly.fit(training_df)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Apply model & show result stats"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"ingroup_scored_df = model.transform(ingroup_df)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"ingroup_scored_df.agg(\n",
@@ -200,21 +215,25 @@
" f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n",
" f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n",
").show()"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"outgroup_scored_df = model.transform(outgroup_df)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"outgroup_scored_df.agg(\n",
@@ -223,19 +242,23 @@
" f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n",
" f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n",
").show()"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Examine results"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
"outputs": [],
"source": [
"#\n",
@@ -265,23 +288,19 @@
"results_to_la = results_to_la.withColumn(\"timestamp\", f.current_timestamp())\n",
"\n",
"display(results_to_la)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"# Display all resource accesses by users with highest anomalous score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# !pip install plotly"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
diff --git a/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb b/docs/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.ipynb
similarity index 95%
rename from notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb
rename to docs/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.ipynb
index 672a33e6cc..226573fbd6 100644
--- a/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb
+++ b/docs/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.ipynb
@@ -23,13 +23,14 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {},
"source": [
"## Overview of the BallTree\n",
"The structure functioning behind the KNN model is a BallTree, which is a recursive binary tree where each node (or \"ball\") contains a partition of the points of data to be queried. Building a BallTree involves assigning data points to the \"ball\" whose center they're closest to (with respect to a certain specified feature), resulting in a structure that allows binary-tree-like traversal and lends itself to finding k-nearest neighbors at a BallTree leaf."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"attachments": {},
@@ -40,18 +41,6 @@
"Import necessary Python libraries and prepare dataset."
]
},
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "from synapse.ml.core.platform import *\n",
- "\n",
- "if running_on_binder():\n",
- " from IPython import get_ipython"
- ]
- },
{
"cell_type": "code",
"execution_count": 4,
@@ -60,20 +49,13 @@
"source": [
"from pyspark.sql.types import BooleanType\n",
"from pyspark.sql.types import *\n",
- "from pyspark.ml.feature import Normalizer\n",
"from pyspark.sql.functions import lit, array, array_contains, udf, col, struct\n",
"from synapse.ml.nn import ConditionalKNN, ConditionalKNNModel\n",
"from PIL import Image\n",
"from io import BytesIO\n",
- "\n",
"import requests\n",
"import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "import matplotlib.pyplot as plt"
]
},
{
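The BallTree partitioning described in the hunk above can be sketched in a few lines of plain NumPy. This is an illustrative toy only, not the SynapseML BallTree or ConditionalKNN implementation, and the split rule (two far-apart seed points, each point assigned to the nearer seed) is an assumption chosen for demonstration.

```python
import numpy as np


def build_ball(points, leaf_size=2):
    """Toy recursive 'ball' partitioning; illustrative only."""
    if len(points) <= leaf_size:
        return {"leaf": points}
    center = points.mean(axis=0)
    # Pick two far-apart seed points and assign each point to the nearer seed.
    seed_a = points[np.argmax(np.linalg.norm(points - center, axis=1))]
    seed_b = points[np.argmax(np.linalg.norm(points - seed_a, axis=1))]
    to_a = np.linalg.norm(points - seed_a, axis=1) <= np.linalg.norm(points - seed_b, axis=1)
    if to_a.all() or (~to_a).all():  # degenerate split (e.g. duplicate points)
        return {"leaf": points}
    return {
        "center": center,
        "radius": float(np.max(np.linalg.norm(points - center, axis=1))),
        "left": build_ball(points[to_a], leaf_size),
        "right": build_ball(points[~to_a], leaf_size),
    }


tree = build_ball(np.random.rand(16, 2))
```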
diff --git a/website/docs/reference/SAR.md b/docs/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md
similarity index 100%
rename from website/docs/reference/SAR.md
rename to docs/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md
diff --git a/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb b/docs/Explore Algorithms/Regression/Quickstart - Data Cleaning.ipynb
similarity index 94%
rename from notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb
rename to docs/Explore Algorithms/Regression/Quickstart - Data Cleaning.ipynb
index faa604edd0..38b1dc7768 100644
--- a/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb
+++ b/docs/Explore Algorithms/Regression/Quickstart - Data Cleaning.ipynb
@@ -20,30 +20,7 @@
"- [`ComputePerInstanceStatistics`\n",
" ](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/synapse.ml.train.html?#module-synapse.ml.train.ComputePerInstanceStatistics)\n",
"- [`DataConversion`\n",
- " ](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/synapse.ml.featurize.html?#module-synapse.ml.featurize.DataConversion)\n",
- "\n",
- "First, import the pandas package"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd"
+ " ](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/synapse.ml.featurize.html?#module-synapse.ml.featurize.DataConversion)"
]
},
{
diff --git a/notebooks/features/regression/Regression - Auto Imports.ipynb b/docs/Explore Algorithms/Regression/Quickstart - Train Regressor.ipynb
similarity index 97%
rename from notebooks/features/regression/Regression - Auto Imports.ipynb
rename to docs/Explore Algorithms/Regression/Quickstart - Train Regressor.ipynb
index 400fa52875..b47c5cdcd1 100644
--- a/notebooks/features/regression/Regression - Auto Imports.ipynb
+++ b/docs/Explore Algorithms/Regression/Quickstart - Train Regressor.ipynb
@@ -29,18 +29,6 @@
"using `pandas.read_csv()`"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/docs/Explore Algorithms/Regression/Quickstart - Vowpal Wabbit and LightGBM.ipynb
similarity index 96%
rename from notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
rename to docs/Explore Algorithms/Regression/Quickstart - Vowpal Wabbit and LightGBM.ipynb
index 97dd1c59ea..c3c2eebd42 100644
--- a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
+++ b/docs/Explore Algorithms/Regression/Quickstart - Vowpal Wabbit and LightGBM.ipynb
@@ -13,23 +13,6 @@
" [Spark MLlib Linear Regression](https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression)."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "if running_on_synapse():\n",
- " from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/website/docs/features/responsible_ai/Data Balance Analysis.md b/docs/Explore Algorithms/Responsible AI/Data Balance Analysis.md
similarity index 99%
rename from website/docs/features/responsible_ai/Data Balance Analysis.md
rename to docs/Explore Algorithms/Responsible AI/Data Balance Analysis.md
index 07f00ad07b..c8437cc65b 100644
--- a/website/docs/features/responsible_ai/Data Balance Analysis.md
+++ b/docs/Explore Algorithms/Responsible AI/Data Balance Analysis.md
@@ -32,7 +32,7 @@ In summary, Data Balance Analysis, when used as a step for building ML models, h
## Examples
-* [Data Balance Analysis - Adult Census Income](../../../features/responsible_ai/DataBalanceAnalysis%20-%20Adult%20Census%20Income)
+* [Quickstart - Data Balance Analysis](../Quickstart%20-%20Data%20Balance%20Analysis)
## Usage
diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb
similarity index 78%
rename from notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb
rename to docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb
index c734bb0a4a..f543dc78f5 100644
--- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb
@@ -2,38 +2,33 @@
"cells": [
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "4a463c67-7543-42d2-a116-e70e8451b09b",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"## Interpretability - Explanation Dashboard\n",
"\n",
"In this example, similar to the \"Interpretability - Tabular SHAP explainer\" notebook, we use Kernel SHAP to explain a tabular classification model built from the Adults Census dataset and then visualize the explanation in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets.\n",
"\n",
"First we import the packages and define some UDFs we will need later."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
+ "outputs": [],
+ "source": [
+ "%pip install raiwidgets itsdangerous==2.0.1 interpret-community"
+ ],
"metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "bf0fdfc2-97b2-48e4-b3d9-794b0cb3da67",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"outputs": [],
"source": [
- "import pyspark\n",
- "from IPython import get_ipython\n",
"from IPython.terminal.interactiveshell import TerminalInteractiveShell\n",
"from synapse.ml.explainers import *\n",
"from pyspark.ml import Pipeline\n",
@@ -42,48 +37,26 @@
"from pyspark.sql.types import *\n",
"from pyspark.sql.functions import *\n",
"import pandas as pd\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import running_on_synapse\n",
- "\n",
- "if running_on_synapse():\n",
- " shell = TerminalInteractiveShell.instance()\n",
- " from notebookutils.visualization import display\n",
- "\n",
"\n",
"vec_access = udf(lambda v, i: float(v[i]), FloatType())\n",
"vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "ae47e1f9-0672-47ed-94de-10970e1b14b5",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"Now let's read the data and train a simple binary classification model."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "58807448-d8e0-4818-adc8-27536d561fb3",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
"outputs": [],
"source": [
"df = spark.read.parquet(\n",
@@ -129,68 +102,46 @@
"lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n",
"pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n",
"model = pipeline.fit(training)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "f617f9a4-7e67-43f8-8fa9-92680b635b3d",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"After the model is trained, we randomly select some observations to be explained."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "f55757a6-6204-4f64-a91e-65bfbacf62bc",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
"outputs": [],
"source": [
"explain_instances = (\n",
" model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n",
")\n",
"display(explain_instances)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "48a0c8ee-8e36-4bd3-9a04-eded6d2c8894",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"We create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column we are trying to explain. In this case, we are trying to explain the \"probability\" output which is a vector of length 2, and we are only looking at class 1 probability. Specify targetClasses to `[0, 1]` if you want to explain class 0 and 1 probability at the same time. Finally we sample 100 rows from the training data for background data, which is used for integrating out features in Kernel SHAP."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "7e097552-e617-4e1c-a085-b66eca5bcb69",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
"outputs": [],
"source": [
"shap = TabularSHAP(\n",
@@ -204,36 +155,24 @@
")\n",
"\n",
"shap_df = shap.transform(explain_instances)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
- "attachments": {},
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "6933b52b-7d46-4210-810a-f984b76dd4a2",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"Once we have the resulting dataframe, we extract the class 1 probability of the model output, the SHAP values for the target class, the original features and the true label. Then we convert it to a pandas dataframe for visualization.\n",
"For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset), and each of the following element is the SHAP values for each feature."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "05e01f98-e44c-46c9-a8ae-26ba892f85b3",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
"outputs": [],
"source": [
"shaps = (\n",
@@ -248,37 +187,25 @@
"shaps_local.sort_values(\"probability\", ascending=False, inplace=True, ignore_index=True)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"shaps_local"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "f9317a27-900a-4d1d-9e9f-9fe906eae75c",
- "showTitle": false,
- "title": ""
- }
- },
"source": [
"We can visualize the explanation in the [interpret-community format](https://github.com/interpretml/interpret-community) in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets/"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "c9b4c03e-eac8-4314-a6c2-0a451525e6a4",
- "showTitle": false,
- "title": ""
- },
- "collapsed": true
- },
"outputs": [],
"source": [
- "import pandas as pd\n",
"import numpy as np\n",
"\n",
"features = categorical_features + numeric_features\n",
@@ -289,14 +216,14 @@
"local_importance_values = shaps_local[[\"shapValues\"]]\n",
"eval_data = shaps_local[features]\n",
"true_y = np.array(shaps_local[[\"label\"]])"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "collapsed": true
- },
"outputs": [],
"source": [
"list_local_importance_values = local_importance_values.values.tolist()\n",
@@ -309,37 +236,19 @@
" # remove the bias from local importance values\n",
" del converted_list[0]\n",
" converted_importance_values.append(converted_list)"
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
},
{
"cell_type": "markdown",
- "metadata": {},
"source": [
"When running Synapse Analytics, please follow instructions here [Package management - Azure Synapse Analytics | Microsoft Docs](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries) to install [\"raiwidgets\"](https://pypi.org/project/raiwidgets/) and [\"interpret-community\"](https://pypi.org/project/interpret-community/) packages."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ ],
"metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "!pip install --upgrade raiwidgets\n",
- "!pip install itsdangerous==2.0.1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "!pip install --upgrade interpret-community"
- ]
+ "collapsed": false
+ }
},
{
"cell_type": "code",
@@ -480,4 +389,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb b/docs/Explore Algorithms/Responsible AI/Image Explainers.ipynb
similarity index 94%
rename from notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb
rename to docs/Explore Algorithms/Responsible AI/Image Explainers.ipynb
index 26681c9f47..5925ef6e07 100644
--- a/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Image Explainers.ipynb
@@ -22,24 +22,14 @@
"from synapse.ml.opencv import ImageTransformer\n",
"from synapse.ml.io import *\n",
"from pyspark.ml import Pipeline\n",
- "from pyspark.ml.classification import LogisticRegression\n",
- "from pyspark.ml.feature import StringIndexer\n",
"from pyspark.sql.functions import *\n",
"from pyspark.sql.types import *\n",
"import numpy as np\n",
- "import pyspark\n",
"import urllib.request\n",
"import matplotlib.pyplot as plt\n",
- "import PIL, io\n",
"from PIL import Image\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "from synapse.ml.core.platform import materializing_display as display\n",
"\n",
"vec_slice = udf(\n",
" lambda vec, indices: (vec.toArray())[indices].tolist(), ArrayType(FloatType())\n",
@@ -73,10 +63,7 @@
" image_array[y, x, 3] = 200\n",
" plt.clf()\n",
" plt.imshow(image_array)\n",
- " if running_on_synapse() or running_on_synapse_internal():\n",
- " plt.show()\n",
- " else:\n",
- " display(plt)"
+ " plt.show()"
]
},
{
diff --git a/website/versioned_docs/version-0.10.0/features/responsible_ai/Model Interpretation on Spark.md b/docs/Explore Algorithms/Responsible AI/Interpreting Model Predictions.md
similarity index 96%
rename from website/versioned_docs/version-0.10.0/features/responsible_ai/Model Interpretation on Spark.md
rename to docs/Explore Algorithms/Responsible AI/Interpreting Model Predictions.md
index 93dbc54cef..92b3ca745f 100644
--- a/website/versioned_docs/version-0.10.0/features/responsible_ai/Model Interpretation on Spark.md
+++ b/docs/Explore Algorithms/Responsible AI/Interpreting Model Predictions.md
@@ -1,7 +1,7 @@
---
-title: Model Interpretation on Spark
+title: Interpreting Model Predictions
hide_title: true
-sidebar_label: Model Interpretation on Spark
+sidebar_label: Interpreting Model Predictions
---
# Model Interpretation on Spark
@@ -26,9 +26,9 @@ Both explainers extends from `org.apache.spark.ml.Transformer`. After setting up
To see examples of model interpretability on Spark in action, take a look at these sample notebooks:
-- [Tabular SHAP explainer](../../../features/responsible_ai/Interpretability%20-%20Tabular%20SHAP%20explainer)
-- [Image explainers](../../../features/responsible_ai/Interpretability%20-%20Image%20Explainers)
-- [Text explainers](../../../features/responsible_ai/Interpretability%20-%20Text%20Explainers)
+- [Tabular Explainers](../Tabular%20Explainers)
+- [Image Explainers](../Image%20Explainers)
+- [Text Explainers](../Text%20Explainers)
| | Tabular models | Vector models | Image models | Text models |
|------------------------|-----------------------------|---------------------------|-------------------------|-----------------------|
diff --git a/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb b/docs/Explore Algorithms/Responsible AI/PDP and ICE Explainers.ipynb
similarity index 99%
rename from notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb
rename to docs/Explore Algorithms/Responsible AI/PDP and ICE Explainers.ipynb
index 12b8a2a05f..4507864250 100644
--- a/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/PDP and ICE Explainers.ipynb
@@ -78,14 +78,7 @@
"from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
"from synapse.ml.explainers import ICETransformer\n",
"import matplotlib.pyplot as plt\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "from synapse.ml.core.platform import *"
]
},
{
diff --git a/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb b/docs/Explore Algorithms/Responsible AI/Quickstart - Data Balance Analysis.ipynb
similarity index 97%
rename from notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb
rename to docs/Explore Algorithms/Responsible AI/Quickstart - Data Balance Analysis.ipynb
index e021561b2e..517cde68ca 100644
--- a/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Quickstart - Data Balance Analysis.ipynb
@@ -51,18 +51,10 @@
},
"outputs": [],
"source": [
- "import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pyspark.sql.functions as F\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
+ "from synapse.ml.core.platform import *"
]
},
{
@@ -179,7 +171,7 @@
}
},
"source": [
- "### [Calculate Feature Balance Measures](/docs/features/responsible_ai/Data%20Balance%20Analysis/)\n",
+ "### [Calculate Feature Balance Measures](../Data%20Balance%20Analysis)\n",
"\n",
"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at equal rates.\n",
"\n",
@@ -380,7 +372,7 @@
}
},
"source": [
- "### Calculate [Distribution Balance Measures](/docs/features/responsible_ai/Data%20Balance%20Analysis/)\n",
+ "### Calculate [Distribution Balance Measures](../Data%20Balance%20Analysis)\n",
"\n",
"Distribution Balance Measures allow us to compare our data with a reference distribution (i.e. uniform distribution). They are calculated per sensitive column and don't use the label column. |"
]
@@ -534,7 +526,7 @@
}
},
"source": [
- "### Calculate [Aggregate Balance Measures](/docs/features/responsible_ai/Data%20Balance%20Analysis/)\n",
+ "### Calculate [Aggregate Balance Measures](../Data%20Balance%20Analysis)\n",
"\n",
"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They are calculated on the global set of sensitive columns and don't use the label column.\n",
"\n",
diff --git a/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb b/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb
similarity index 97%
rename from notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb
rename to docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb
index 95721aae21..c478c81bea 100644
--- a/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb
@@ -13,16 +13,8 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "import os\n",
- "from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "if running_on_synapse():\n",
- " from notebookutils.visualization import display\n",
- "\n",
"bing_search_key = find_secret(\"bing-search-key\")\n",
"\n",
"# WARNING this notebook requires a lot of memory.\n",
diff --git a/notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb b/docs/Explore Algorithms/Responsible AI/Tabular Explainers.ipynb
similarity index 97%
rename from notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb
rename to docs/Explore Algorithms/Responsible AI/Tabular Explainers.ipynb
index 18f053096a..55efc6d590 100644
--- a/notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Tabular Explainers.ipynb
@@ -32,7 +32,6 @@
},
"outputs": [],
"source": [
- "import pyspark\n",
"from synapse.ml.explainers import *\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.classification import LogisticRegression\n",
@@ -40,15 +39,8 @@
"from pyspark.sql.types import *\n",
"from pyspark.sql.functions import *\n",
"import pandas as pd\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "from synapse.ml.core.platform import materializing_display as display\n",
- "\n",
"\n",
"vec_access = udf(lambda v, i: float(v[i]), FloatType())\n",
"vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))"
@@ -309,7 +301,9 @@
"fig.update_yaxes(range=[-1, 1], fixedrange=True, zerolinecolor=\"black\")\n",
"fig.update_xaxes(type=\"category\", tickangle=45, fixedrange=True)\n",
"fig.update_layout(height=400 * rows, title_text=\"SHAP explanations\")\n",
- "fig.show()"
+ "\n",
+ "if not running_on_synapse():\n",
+ " fig.show()"
]
},
{
diff --git a/notebooks/features/responsible_ai/Interpretability - Text Explainers.ipynb b/docs/Explore Algorithms/Responsible AI/Text Explainers.ipynb
similarity index 96%
rename from notebooks/features/responsible_ai/Interpretability - Text Explainers.ipynb
rename to docs/Explore Algorithms/Responsible AI/Text Explainers.ipynb
index ed78e24552..a46ca98f4f 100644
--- a/notebooks/features/responsible_ai/Interpretability - Text Explainers.ipynb
+++ b/docs/Explore Algorithms/Responsible AI/Text Explainers.ipynb
@@ -33,20 +33,12 @@
"source": [
"from pyspark.sql.functions import *\n",
"from pyspark.sql.types import *\n",
- "from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, Tokenizer\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.classification import LogisticRegression\n",
"from synapse.ml.explainers import *\n",
"from synapse.ml.featurize.text import TextFeaturizer\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
"from synapse.ml.core.platform import *\n",
"\n",
- "from synapse.ml.core.platform import materializing_display as display",
- "\n",
"vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))\n",
"vec_access = udf(lambda v, i: float(v[i]), FloatType())"
]
diff --git a/notebooks/features/vw/Vowpal Wabbit - Contextual Bandits.ipynb b/docs/Explore Algorithms/Vowpal Wabbit/Contextual Bandits.ipynb
similarity index 92%
rename from notebooks/features/vw/Vowpal Wabbit - Contextual Bandits.ipynb
rename to docs/Explore Algorithms/Vowpal Wabbit/Contextual Bandits.ipynb
index ea51fa390e..b3616faabe 100644
--- a/notebooks/features/vw/Vowpal Wabbit - Contextual Bandits.ipynb
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Contextual Bandits.ipynb
@@ -22,22 +22,6 @@
"## Step1: Read the dataset"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/notebooks/features/vw/Vowpal Wabbit - Multi-class classification.ipynb b/docs/Explore Algorithms/Vowpal Wabbit/Multi-class classification.ipynb
similarity index 91%
rename from notebooks/features/vw/Vowpal Wabbit - Multi-class classification.ipynb
rename to docs/Explore Algorithms/Vowpal Wabbit/Multi-class classification.ipynb
index f5ce7ebbeb..08599b34a2 100644
--- a/notebooks/features/vw/Vowpal Wabbit - Multi-class classification.ipynb
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Multi-class classification.ipynb
@@ -16,22 +16,6 @@
"#### Read dataset"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/website/versioned_docs/version-0.10.0/features/vw/about.md b/docs/Explore Algorithms/Vowpal Wabbit/Overview.md
similarity index 97%
rename from website/versioned_docs/version-0.10.0/features/vw/about.md
rename to docs/Explore Algorithms/Vowpal Wabbit/Overview.md
index ac0f56ff2f..eda047ca0b 100644
--- a/website/versioned_docs/version-0.10.0/features/vw/about.md
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Overview.md
@@ -64,7 +64,7 @@ model = (VowpalWabbitRegressor(args="--holdout_off --loss_function quantile -q :
You can pass command line parameters to VW via the args parameter, as documented in the [VW Wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments).
For an end to end application, check out the VowpalWabbit [notebook
-example](../Vowpal%20Wabbit%20-%20Overview).
+example](../Quickstart%20-%20Classification,%20Quantile%20Regression,%20and%20Regression).
### Hyper-parameter tuning
@@ -87,7 +87,7 @@ VowpalWabbit on Spark uses an optimized JNI layer to efficiently support Spark.
Java bindings can be found in the [VW GitHub repo](https://github.com/VowpalWabbit/vowpal_wabbit/blob/master/java/src/main/c%2B%2B/jni_spark_vw_generated.h).
VW's command line tool uses a two-thread architecture (1x parsing/hashing, 1x learning) for learning and inference.
-To fluently embed VW into the Spark ML eco system, the following adaptions were made:
+To fluently embed VW into the Spark ML ecosystem, the following adaptations were made:

- VW classifier/regressor operates on Spark's dense/sparse vectors
- Pro: best composability with existing Spark ML components.
diff --git a/notebooks/features/vw/Vowpal Wabbit - Classification using SparkML Vector.ipynb b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using SparkML Vectors.ipynb
similarity index 90%
rename from notebooks/features/vw/Vowpal Wabbit - Classification using SparkML Vector.ipynb
rename to docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using SparkML Vectors.ipynb
index ce1475083c..b08913b78c 100644
--- a/notebooks/features/vw/Vowpal Wabbit - Classification using SparkML Vector.ipynb
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using SparkML Vectors.ipynb
@@ -23,22 +23,6 @@
"#### Read dataset"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -46,7 +30,6 @@
"outputs": [],
"source": [
"import pyspark.sql.types as T\n",
- "from pyspark.sql import functions as F\n",
"\n",
"schema = T.StructType(\n",
" [\n",
diff --git a/notebooks/features/vw/Vowpal Wabbit - Classification using VW-native Format.ipynb b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using VW-native Format.ipynb
similarity index 91%
rename from notebooks/features/vw/Vowpal Wabbit - Classification using VW-native Format.ipynb
rename to docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using VW-native Format.ipynb
index 38aae4b605..a88965eeac 100644
--- a/notebooks/features/vw/Vowpal Wabbit - Classification using VW-native Format.ipynb
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using VW-native Format.ipynb
@@ -23,22 +23,6 @@
"#### Read dataset"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification, Quantile Regression, and Regression.ipynb
similarity index 98%
rename from notebooks/features/vw/Vowpal Wabbit - Overview.ipynb
rename to docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification, Quantile Regression, and Regression.ipynb
index 66c5258fe6..d60d70fb97 100644
--- a/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb
+++ b/docs/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification, Quantile Regression, and Regression.ipynb
@@ -85,23 +85,6 @@
"#### Read dataset"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "\n",
- "# Bootstrap Spark Session\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "if running_on_synapse():\n",
- " from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/notebooks/documentation/prerequisites_platforms.ipynb b/docs/Get Started/Create a Spark Cluster.ipynb
similarity index 100%
rename from notebooks/documentation/prerequisites_platforms.ipynb
rename to docs/Get Started/Create a Spark Cluster.ipynb
diff --git a/website/docs/getting_started/installation.md b/docs/Get Started/Install SynapseML.md
similarity index 94%
rename from website/docs/getting_started/installation.md
rename to docs/Get Started/Install SynapseML.md
index fb977d482a..6870207c39 100644
--- a/website/docs/getting_started/installation.md
+++ b/docs/Get Started/Install SynapseML.md
@@ -1,6 +1,6 @@
---
-title: Installation
-description: Getting started with SynapseML
+title: Install SynapseML
+description: Install SynapseML
---
## Synapse
@@ -180,7 +180,7 @@ docker run -it -p 8888:8888 -e ACCEPT_EULA=yes mcr.microsoft.com/mmlspark/releas
```
Navigate to in your web browser to run the sample
-notebooks. See the [documentation](reference/docker.md) for more on Docker use.
+notebooks. See the [documentation](../../Reference/Docker%20Setup.md) for more on Docker use.
> To read the EULA for using the docker image, run
``` bash
@@ -191,21 +191,21 @@ docker run -it -p 8888:8888 mcr.microsoft.com/mmlspark/release eula
## Building from source
SynapseML has recently transitioned to a new build infrastructure.
-For detailed developer docs, see the [Developer Readme](reference/developer-readme.md)
+For detailed developer docs, see the [Developer Readme](../../Reference/Docker%20Setup)
If you're an existing SynapseML developer, you'll need to reconfigure your
development setup. We now support platform independent development and
better integrate with intellij and SBT.
- If you encounter issues, reach out to our support email!
+If you encounter issues, reach out to our support email!
## R (Beta)
To try out SynapseML using the R autogenerated wrappers, [see our
-instructions](reference/R-setup.md). Note: This feature is still under development
+instructions](../../Reference/R%20Setup). Note: This feature is still under development
and some necessary custom wrappers may be missing.
## C# (.NET)
-To try out SynapseML with .NET, follow the [.NET Installation Guide](reference/dotnet-setup.md).
+To try out SynapseML with .NET, follow the [.NET Installation Guide](../../Reference/Dotnet%20Setup).
Note: Some stages including AzureSearchWriter, DiagnosticInfo, UDPyF Param, ParamSpaceParam, BallTreeParam,
ConditionalBallTreeParam, LightGBMBooster Param are still under development and not exposed in .NET.
diff --git a/notebooks/features/classification/Classification - Sentiment Analysis Quickstart.ipynb b/docs/Get Started/Quickstart - Your First Models.ipynb
similarity index 86%
rename from notebooks/features/classification/Classification - Sentiment Analysis Quickstart.ipynb
rename to docs/Get Started/Quickstart - Your First Models.ipynb
index ef3416f258..4c339dc5f6 100644
--- a/notebooks/features/classification/Classification - Sentiment Analysis Quickstart.ipynb
+++ b/docs/Get Started/Quickstart - Your First Models.ipynb
@@ -11,43 +11,10 @@
}
},
"source": [
- "# Build your first SynapseML model\n",
+ "# Build your first SynapseML models\n",
"This tutorial provides a brief introduction to SynapseML. In particular, we use SynapseML to create two different pipelines for sentiment analysis. The first pipeline combines a text featurization stage with LightGBM regression to predict ratings based on review text from a dataset containing book reviews from Amazon. The second pipeline shows how to use prebuilt models through the Azure Cognitive Services to solve this problem without training data."
]
},
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Set up the environment\n",
- "Import SynapseML libraries and initialize your Spark session."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false,
- "source_hidden": false
- },
- "nteract": {
- "transient": {
- "deleting": false
- }
- }
- },
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "from synapse.ml.core.platform import *\n",
- "\n",
- "spark = SparkSession.builder.getOrCreate()\n",
- "\n",
- "from synapse.ml.core.platform import materializing_display as display"
- ]
- },
{
"attachments": {},
"cell_type": "markdown",
diff --git a/notebooks/documentation/prerequisites_azure_resources.ipynb b/docs/Get Started/Set up Cognitive Services.ipynb
similarity index 83%
rename from notebooks/documentation/prerequisites_azure_resources.ipynb
rename to docs/Get Started/Set up Cognitive Services.ipynb
index 3fa03ebdd5..f196249fe4 100644
--- a/notebooks/documentation/prerequisites_azure_resources.ipynb
+++ b/docs/Get Started/Set up Cognitive Services.ipynb
@@ -6,9 +6,7 @@
"source": [
"# Setting up Cognitive Services and Azure OpenAI resources for SynapseML \n",
"\n",
- "In order to use SynapseML's OpenAI or Cognitive Services features, specific Azure resources are required. This documentation walks you through the process of setting up these resources and acquiring the necessary credentials.\n",
- "\n",
- "To utilize SynapseML's OpenAI or Cognitive Services features, you need to create resources and obtain the API keys. "
+ "In order to use SynapseML's OpenAI or Cognitive Services features, specific Azure resources are required. This documentation walks you through the process of setting up these resources and acquiring the necessary credentials."
]
},
{
@@ -19,7 +17,7 @@
]
},
"source": [
- "You need a valid Azure subscription to create resources.\n",
+ "First, create an Azure subscription to create resources.\n",
"* A valid Azure subscription - [Create one for free](https://azure.microsoft.com/free/cognitive-services/)."
]
},
@@ -27,7 +25,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Azure OpenAI resource\n",
+ "## Azure OpenAI\n",
"\n",
"The [Azure OpenAI service](https://azure.microsoft.com/products/cognitive-services/openai-service/) can be used to solve a large number of natural language tasks through prompting the completion API. To make it easier to scale your prompting workflows from a few examples to large datasets of examples, we have integrated the Azure OpenAI service with the distributed machine learning library SynapseML. This integration makes it easy to use the Apache Spark distributed computing framework to process millions of prompts with the OpenAI service."
]
@@ -50,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Cognitive Services resource"
+ "## Cognitive Services"
]
},
{
@@ -64,7 +62,7 @@
"To set up [Cognitive Services](https://azure.microsoft.com/products/cognitive-services/) for use with SynapseML you first need to:\n",
"* [Assign yourself the Cognitive Services Contributor role](https://learn.microsoft.com/azure/role-based-access-control/role-assignments-steps) to agree to the responsible AI terms and create a resource. \n",
"* [Create an Azure Cognitive multi-service (Decision, Language, Speech, Vision) resource](https://portal.azure.com/#create/Microsoft.CognitiveServicesAllInOne). Alternatively, you can follow the steps to [create Single-service resource](https://learn.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account?tabs=decision%2Canomaly-detector%2Clanguage-service%2Ccomputer-vision%2Cwindows#create-a-new-azure-cognitive-services-resource). \n",
-"* Get your Cognitive Service resource's key. After your resource is successfully deployed, select **Next Steps** > **Go to resource**. Once at the resource, you can get the key from **Resource Management** > **Keys and Endpoint**. Copy the key and paste it into the notebook. Store keys securely and do not share them. "
+ "* Get your Cognitive Service resource's key. After your resource is successfully deployed, select **Next Steps** > **Go to resource**. Once at the resource, you can get the key from **Resource Management** > **Keys and Endpoint**. Copy the key and paste it into the notebook. Store keys securely and do not share them. "
]
}
],
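Once the Azure OpenAI resource above is provisioned, it is typically consumed from Spark through the `OpenAICompletion` transformer. The sketch below is hedged: the import path (`synapse.ml.cognitive`) and setter names (`setSubscriptionKey`, `setCustomServiceName`, `setDeploymentName`, `setMaxTokens`, `setPromptCol`, `setOutputCol`) are assumed to match this release, and the service name, deployment name, and key are placeholders you must replace.

```python
from pyspark.sql import SparkSession
from synapse.ml.cognitive import OpenAICompletion

spark = SparkSession.builder.getOrCreate()

# Placeholders: substitute your Azure OpenAI resource name, deployment, and key.
service_name = "my-openai-resource"    # assumption
deployment_name = "my-gpt-deployment"  # assumption
key = "replace-with-your-key"          # assumption

prompts = spark.createDataFrame([("Hello, my name is",)], ["prompt"])

# Configure the completion transformer to read prompts from the "prompt" column.
completion = (
    OpenAICompletion()
    .setSubscriptionKey(key)
    .setCustomServiceName(service_name)
    .setDeploymentName(deployment_name)
    .setMaxTokens(20)
    .setPromptCol("prompt")
    .setOutputCol("completions")
)

completion.transform(prompts).show(truncate=False)
```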
diff --git a/docs/Overview.md b/docs/Overview.md
new file mode 100644
index 0000000000..c5a4a615d9
--- /dev/null
+++ b/docs/Overview.md
@@ -0,0 +1,29 @@
+---
+title: What is SynapseML?
+sidebar_label: What is SynapseML?
+hide_title: true
+---
+
+import useBaseUrl from "@docusaurus/useBaseUrl";
+
+# What is SynapseML?
+
+SynapseML (previously known as MMLSpark) is an open-source library that simplifies the creation of massively scalable machine learning (ML) pipelines. SynapseML provides simple, composable, and distributed APIs for a wide variety of machine learning tasks such as text analytics, vision, anomaly detection, and many others. SynapseML is built on the [Apache Spark distributed computing framework](https://spark.apache.org/) and shares the same API as the [SparkML/MLLib library](https://spark.apache.org/mllib/), allowing you to seamlessly embed SynapseML models into existing Apache Spark workflows.
+
+With SynapseML, you can build scalable and intelligent systems to solve challenges in domains such as anomaly detection, computer vision, deep learning, text analytics, and others. SynapseML can train and evaluate models on single-node, multi-node, and elastically resizable clusters of computers. This lets you scale your work without wasting resources. SynapseML is usable across Python, R, Scala, Java, and .NET. Furthermore, its API abstracts over a wide variety of databases, file systems, and cloud data stores to simplify experiments no matter where data is located.
+
+SynapseML requires Scala 2.12, Spark 3.2+, and Python 3.8+.
+
+import Link from '@docusaurus/Link';
+
+Get Started
+
+## Papers
+
+- [Large Scale Intelligent Microservices](https://arxiv.org/abs/2009.08044)
+
+- [Conditional Image Retrieval](https://arxiv.org/abs/2007.07177)
+
+- [SynapseML: Unifying Machine Learning Ecosystems at Massive Scales](https://arxiv.org/abs/1810.08744)
+
+- [Flexible and Scalable Deep Learning with MMLSpark](https://arxiv.org/abs/1804.04031)
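The overview's claim that SynapseML shares the SparkML API is concrete enough to demonstrate: the sketch below drops a SynapseML `LightGBMRegressor` into an ordinary `pyspark.ml.Pipeline`. The synthetic data and the `numIterations` setting are illustrative assumptions, not a recommended configuration.

```python
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from synapse.ml.lightgbm import LightGBMRegressor

spark = SparkSession.builder.getOrCreate()

# Synthetic regression data: two features and a numeric label.
rows = [(float(i), float(i % 5), 2.0 * i + (i % 5)) for i in range(50)]
df = spark.createDataFrame(rows, ["x1", "x2", "label"])

assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
lgbm = LightGBMRegressor(featuresCol="features", labelCol="label", numIterations=10)

# SynapseML stages compose with built-in SparkML stages in a single Pipeline.
model = Pipeline(stages=[assembler, lgbm]).fit(df)
model.transform(df).select("x1", "x2", "label", "prediction").show()
```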
diff --git a/website/docs/documentation/estimators/_LightGBM.md b/docs/Quick Examples/estimators/_LightGBM.md
similarity index 100%
rename from website/docs/documentation/estimators/_LightGBM.md
rename to docs/Quick Examples/estimators/_LightGBM.md
diff --git a/website/docs/documentation/estimators/_VW.md b/docs/Quick Examples/estimators/_VW.md
similarity index 100%
rename from website/docs/documentation/estimators/_VW.md
rename to docs/Quick Examples/estimators/_VW.md
diff --git a/website/versioned_docs/version-0.11.0/documentation/estimators/causal/_causalInferenceDML.md b/docs/Quick Examples/estimators/causal/_causalInferenceDML.md
similarity index 100%
rename from website/versioned_docs/version-0.11.0/documentation/estimators/causal/_causalInferenceDML.md
rename to docs/Quick Examples/estimators/causal/_causalInferenceDML.md
diff --git a/website/docs/documentation/estimators/cognitive/_MAD.md b/docs/Quick Examples/estimators/cognitive/_MAD.md
similarity index 100%
rename from website/docs/documentation/estimators/cognitive/_MAD.md
rename to docs/Quick Examples/estimators/cognitive/_MAD.md
diff --git a/website/docs/documentation/estimators/core/_AutoML.md b/docs/Quick Examples/estimators/core/_AutoML.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_AutoML.md
rename to docs/Quick Examples/estimators/core/_AutoML.md
diff --git a/website/docs/documentation/estimators/core/_Featurize.md b/docs/Quick Examples/estimators/core/_Featurize.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_Featurize.md
rename to docs/Quick Examples/estimators/core/_Featurize.md
diff --git a/website/docs/documentation/estimators/core/_IsolationForest.md b/docs/Quick Examples/estimators/core/_IsolationForest.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_IsolationForest.md
rename to docs/Quick Examples/estimators/core/_IsolationForest.md
diff --git a/website/docs/documentation/estimators/core/_NN.md b/docs/Quick Examples/estimators/core/_NN.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_NN.md
rename to docs/Quick Examples/estimators/core/_NN.md
diff --git a/website/docs/documentation/estimators/core/_Recommendation.md b/docs/Quick Examples/estimators/core/_Recommendation.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_Recommendation.md
rename to docs/Quick Examples/estimators/core/_Recommendation.md
diff --git a/website/docs/documentation/estimators/core/_Stages.md b/docs/Quick Examples/estimators/core/_Stages.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_Stages.md
rename to docs/Quick Examples/estimators/core/_Stages.md
diff --git a/website/docs/documentation/estimators/core/_Train.md b/docs/Quick Examples/estimators/core/_Train.md
similarity index 100%
rename from website/docs/documentation/estimators/core/_Train.md
rename to docs/Quick Examples/estimators/core/_Train.md
diff --git a/website/versioned_docs/version-0.11.0/documentation/estimators/estimators_causal.md b/docs/Quick Examples/estimators/estimators_causal.md
similarity index 100%
rename from website/versioned_docs/version-0.11.0/documentation/estimators/estimators_causal.md
rename to docs/Quick Examples/estimators/estimators_causal.md
diff --git a/website/docs/documentation/estimators/estimators_cognitive.md b/docs/Quick Examples/estimators/estimators_cognitive.md
similarity index 100%
rename from website/docs/documentation/estimators/estimators_cognitive.md
rename to docs/Quick Examples/estimators/estimators_cognitive.md
diff --git a/website/docs/documentation/estimators/estimators_core.md b/docs/Quick Examples/estimators/estimators_core.md
similarity index 100%
rename from website/docs/documentation/estimators/estimators_core.md
rename to docs/Quick Examples/estimators/estimators_core.md
diff --git a/website/docs/documentation/estimators/estimators_lightgbm.md b/docs/Quick Examples/estimators/estimators_lightgbm.md
similarity index 100%
rename from website/docs/documentation/estimators/estimators_lightgbm.md
rename to docs/Quick Examples/estimators/estimators_lightgbm.md
diff --git a/website/docs/documentation/estimators/estimators_vw.md b/docs/Quick Examples/estimators/estimators_vw.md
similarity index 100%
rename from website/docs/documentation/estimators/estimators_vw.md
rename to docs/Quick Examples/estimators/estimators_vw.md
diff --git a/website/docs/documentation/transformers/_OpenCV.md b/docs/Quick Examples/transformers/_OpenCV.md
similarity index 100%
rename from website/docs/documentation/transformers/_OpenCV.md
rename to docs/Quick Examples/transformers/_OpenCV.md
diff --git a/website/docs/documentation/transformers/_VW.md b/docs/Quick Examples/transformers/_VW.md
similarity index 100%
rename from website/docs/documentation/transformers/_VW.md
rename to docs/Quick Examples/transformers/_VW.md
diff --git a/website/docs/documentation/transformers/cognitive/_AnomalyDetection.md b/docs/Quick Examples/transformers/cognitive/_AnomalyDetection.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_AnomalyDetection.md
rename to docs/Quick Examples/transformers/cognitive/_AnomalyDetection.md
diff --git a/website/docs/documentation/transformers/cognitive/_AzureSearch.md b/docs/Quick Examples/transformers/cognitive/_AzureSearch.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_AzureSearch.md
rename to docs/Quick Examples/transformers/cognitive/_AzureSearch.md
diff --git a/website/docs/documentation/transformers/cognitive/_BingImageSearch.md b/docs/Quick Examples/transformers/cognitive/_BingImageSearch.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_BingImageSearch.md
rename to docs/Quick Examples/transformers/cognitive/_BingImageSearch.md
diff --git a/website/docs/documentation/transformers/cognitive/_ComputerVision.md b/docs/Quick Examples/transformers/cognitive/_ComputerVision.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_ComputerVision.md
rename to docs/Quick Examples/transformers/cognitive/_ComputerVision.md
diff --git a/website/docs/documentation/transformers/cognitive/_Face.md b/docs/Quick Examples/transformers/cognitive/_Face.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_Face.md
rename to docs/Quick Examples/transformers/cognitive/_Face.md
diff --git a/website/docs/documentation/transformers/cognitive/_FormRecognizer.md b/docs/Quick Examples/transformers/cognitive/_FormRecognizer.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_FormRecognizer.md
rename to docs/Quick Examples/transformers/cognitive/_FormRecognizer.md
diff --git a/website/docs/documentation/transformers/cognitive/_SpeechToText.md b/docs/Quick Examples/transformers/cognitive/_SpeechToText.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_SpeechToText.md
rename to docs/Quick Examples/transformers/cognitive/_SpeechToText.md
diff --git a/website/docs/documentation/transformers/cognitive/_TextAnalytics.md b/docs/Quick Examples/transformers/cognitive/_TextAnalytics.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_TextAnalytics.md
rename to docs/Quick Examples/transformers/cognitive/_TextAnalytics.md
diff --git a/website/docs/documentation/transformers/cognitive/_Translator.md b/docs/Quick Examples/transformers/cognitive/_Translator.md
similarity index 100%
rename from website/docs/documentation/transformers/cognitive/_Translator.md
rename to docs/Quick Examples/transformers/cognitive/_Translator.md
diff --git a/website/docs/documentation/transformers/core/_Explainers.md b/docs/Quick Examples/transformers/core/_Explainers.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_Explainers.md
rename to docs/Quick Examples/transformers/core/_Explainers.md
diff --git a/website/docs/documentation/transformers/core/_Featurize.md b/docs/Quick Examples/transformers/core/_Featurize.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_Featurize.md
rename to docs/Quick Examples/transformers/core/_Featurize.md
diff --git a/website/docs/documentation/transformers/core/_IO.md b/docs/Quick Examples/transformers/core/_IO.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_IO.md
rename to docs/Quick Examples/transformers/core/_IO.md
diff --git a/website/docs/documentation/transformers/core/_Image.md b/docs/Quick Examples/transformers/core/_Image.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_Image.md
rename to docs/Quick Examples/transformers/core/_Image.md
diff --git a/website/docs/documentation/transformers/core/_Stages.md b/docs/Quick Examples/transformers/core/_Stages.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_Stages.md
rename to docs/Quick Examples/transformers/core/_Stages.md
diff --git a/website/docs/documentation/transformers/core/_Train.md b/docs/Quick Examples/transformers/core/_Train.md
similarity index 100%
rename from website/docs/documentation/transformers/core/_Train.md
rename to docs/Quick Examples/transformers/core/_Train.md
diff --git a/website/docs/documentation/transformers/deep_learning/_ONNXModel.md b/docs/Quick Examples/transformers/deep_learning/_ONNXModel.md
similarity index 100%
rename from website/docs/documentation/transformers/deep_learning/_ONNXModel.md
rename to docs/Quick Examples/transformers/deep_learning/_ONNXModel.md
diff --git a/website/docs/documentation/transformers/transformers_cognitive.md b/docs/Quick Examples/transformers/transformers_cognitive.md
similarity index 100%
rename from website/docs/documentation/transformers/transformers_cognitive.md
rename to docs/Quick Examples/transformers/transformers_cognitive.md
diff --git a/website/docs/documentation/transformers/transformers_core.md b/docs/Quick Examples/transformers/transformers_core.md
similarity index 100%
rename from website/docs/documentation/transformers/transformers_core.md
rename to docs/Quick Examples/transformers/transformers_core.md
diff --git a/website/docs/documentation/transformers/transformers_deep_learning.md b/docs/Quick Examples/transformers/transformers_deep_learning.md
similarity index 100%
rename from website/docs/documentation/transformers/transformers_deep_learning.md
rename to docs/Quick Examples/transformers/transformers_deep_learning.md
diff --git a/website/docs/documentation/transformers/transformers_opencv.md b/docs/Quick Examples/transformers/transformers_opencv.md
similarity index 100%
rename from website/docs/documentation/transformers/transformers_opencv.md
rename to docs/Quick Examples/transformers/transformers_opencv.md
diff --git a/website/docs/documentation/transformers/transformers_vw.md b/docs/Quick Examples/transformers/transformers_vw.md
similarity index 100%
rename from website/docs/documentation/transformers/transformers_vw.md
rename to docs/Quick Examples/transformers/transformers_vw.md
diff --git a/website/versioned_docs/version-0.10.1/reference/contributing_guide.md b/docs/Reference/Contributor Guide.md
similarity index 97%
rename from website/versioned_docs/version-0.10.1/reference/contributing_guide.md
rename to docs/Reference/Contributor Guide.md
index 341edbd548..e841340082 100644
--- a/website/versioned_docs/version-0.10.1/reference/contributing_guide.md
+++ b/docs/Reference/Contributor Guide.md
@@ -1,8 +1,8 @@
---
-title: Contributing Guide
+title: Contributor Guide
hide_title: true
-sidebar_label: Contributing Guide
-description: Contributing Guide
+sidebar_label: Contributor Guide
+description: Contributor Guide
---
## Interested in contributing to SynapseML? We're excited to work with you.
diff --git a/website/docs/reference/developer-readme.md b/docs/Reference/Developer Setup.md
similarity index 97%
rename from website/docs/reference/developer-readme.md
rename to docs/Reference/Developer Setup.md
index d22bc529fe..b448a2910c 100644
--- a/website/docs/reference/developer-readme.md
+++ b/docs/Reference/Developer Setup.md
@@ -1,8 +1,8 @@
---
-title: Development Setup and Building From Source
+title: Developer Setup
hide_title: true
-sidebar_label: Development Setup
-description: SynapseML Development Setup
+sidebar_label: Developer Setup
+description: Developer Setup
---
# SynapseML Development Setup
diff --git a/website/docs/reference/docker.md b/docs/Reference/Docker Setup.md
similarity index 99%
rename from website/docs/reference/docker.md
rename to docs/Reference/Docker Setup.md
index b92f92b772..b65dcb4394 100644
--- a/website/docs/reference/docker.md
+++ b/docs/Reference/Docker Setup.md
@@ -1,7 +1,7 @@
---
-title: Using the SynapseML Docker Image
-sidebar_label: Docker Image
-description: Using the SynapseML Docker Image
+title: Docker Setup
+sidebar_label: Docker Setup
+description: Docker Setup
---
## Quickstart: install and run the Docker image
diff --git a/website/versioned_docs/version-0.11.2/reference/dotnet-setup.md b/docs/Reference/Dotnet Setup.md
similarity index 99%
rename from website/versioned_docs/version-0.11.2/reference/dotnet-setup.md
rename to docs/Reference/Dotnet Setup.md
index e839b7548e..f0ccd7d7b6 100644
--- a/website/versioned_docs/version-0.11.2/reference/dotnet-setup.md
+++ b/docs/Reference/Dotnet Setup.md
@@ -2,7 +2,7 @@
title: .NET setup
hide_title: true
sidebar_label: .NET setup
-description: .NET setup and example for SynapseML
+description: .NET setup
---
import Tabs from '@theme/Tabs';
diff --git a/website/versioned_docs/version-0.11.2/getting_started/dotnet_example.md b/docs/Reference/Quickstart - LightGBM in Dotnet.md
similarity index 98%
rename from website/versioned_docs/version-0.11.2/getting_started/dotnet_example.md
rename to docs/Reference/Quickstart - LightGBM in Dotnet.md
index fd56c5a83f..579c1b897c 100644
--- a/website/versioned_docs/version-0.11.2/getting_started/dotnet_example.md
+++ b/docs/Reference/Quickstart - LightGBM in Dotnet.md
@@ -1,11 +1,11 @@
---
-title: .NET Example with LightGBMClassifier
-sidebar_label: .NET example
+title: Quickstart - LightGBM in Dotnet
+sidebar_label: Quickstart - LightGBM in Dotnet
description: A simple example about classification with LightGBMClassifier using .NET
---
:::note
-Make sure you have followed the guidance in [.NET installation](../reference/dotnet-setup.md) before jumping into this example.
+Make sure you have followed the guidance in [.NET installation](../Dotnet%20Setup) before jumping into this example.
:::
## Classification with LightGBMClassifier
diff --git a/website/versioned_docs/version-0.11.2/reference/R-setup.md b/docs/Reference/R Setup.md
similarity index 98%
rename from website/versioned_docs/version-0.11.2/reference/R-setup.md
rename to docs/Reference/R Setup.md
index 1cb70d19dd..45f98a91f5 100644
--- a/website/versioned_docs/version-0.11.2/reference/R-setup.md
+++ b/docs/Reference/R Setup.md
@@ -132,7 +132,7 @@ ml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)
## Building from Source
Our R bindings are built as part of the [normal build
-process](developer-readme.md). To get a quick build, start at the root
+process](../Developer%20Setup). To get a quick build, start at the root
of the synapseml directory, and find the generated files. For instance,
to find the R files for deep-learning, run
diff --git a/website/versioned_docs/version-0.11.0/mlflow/autologging.md b/docs/Use with MLFlow/Autologging.md
similarity index 97%
rename from website/versioned_docs/version-0.11.0/mlflow/autologging.md
rename to docs/Use with MLFlow/Autologging.md
index 76149e72fb..b440434e3e 100644
--- a/website/versioned_docs/version-0.11.0/mlflow/autologging.md
+++ b/docs/Use with MLFlow/Autologging.md
@@ -1,5 +1,6 @@
---
-title: SynapseML Autologging
+title: Autologging
+sidebar_label: Autologging
description: SynapseML autologging
---
@@ -23,7 +24,7 @@ Note:
## Configuration process in Databricks as an example
-1. Install latest MLflow via `%pip install mlflow -u`
+1. Install latest MLflow via `%pip install mlflow`
2. Upload your customized `log_model_allowlist.txt` file to dbfs by clicking File/Upload Data button on Databricks UI.
3. Set Cluster Spark configuration following [this documentation](https://docs.microsoft.com/en-us/azure/databricks/clusters/configure#spark-configuration)
```
diff --git a/docs/Use with MLFlow/Install.md b/docs/Use with MLFlow/Install.md
new file mode 100644
index 0000000000..f85f524812
--- /dev/null
+++ b/docs/Use with MLFlow/Install.md
@@ -0,0 +1,4 @@
+---
+title: Install
+description: Install MLflow on different environments
+---
diff --git a/website/versioned_docs/version-0.11.0/mlflow/examples.md b/docs/Use with MLFlow/Overview.md
similarity index 53%
rename from website/versioned_docs/version-0.11.0/mlflow/examples.md
rename to docs/Use with MLFlow/Overview.md
index f1745b3aeb..c6956e72ad 100644
--- a/website/versioned_docs/version-0.11.0/mlflow/examples.md
+++ b/docs/Use with MLFlow/Overview.md
@@ -1,26 +1,91 @@
---
-title: Examples
-description: Examples using SynapseML with MLflow
+title: Overview
+description: MLflow support for SynapseML
---
-## Prerequisites
+## What is MLflow?
-If you're using Databricks, install mlflow with this command:
+[MLflow](https://github.com/mlflow/mlflow) is a platform that streamlines machine learning development, including tracking experiments, packaging code into reproducible runs, and sharing and deploying models. MLflow offers a set of lightweight APIs that can be used with any existing machine learning application or library, such as TensorFlow, PyTorch, or XGBoost. It runs wherever you currently run ML code, for example in notebooks, standalone applications, or the cloud. MLflow's current components are:
+
+* [MLflow Tracking](https://mlflow.org/docs/latest/tracking.html): An API to log parameters, code, and results in machine learning experiments and compare them using an interactive UI (a short example follows this list).
+* [MLflow Projects](https://mlflow.org/docs/latest/projects.html): A code packaging format for reproducible runs using Conda and Docker, so you can share your ML code with others.
+* [MLflow Models](https://mlflow.org/docs/latest/models.html): A model packaging format and tools that let you easily deploy the same model from any ML library for both batch and real-time scoring. It supports platforms such as Docker, Apache Spark, Azure ML and AWS SageMaker.
+* [MLflow Model Registry](https://mlflow.org/docs/latest/model-registry.html): A centralized model store, set of APIs, and UI to collaboratively manage the full lifecycle of MLflow Models.
+
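+For orientation, here is a minimal tracking sketch; the experiment, parameter, and metric names and values are placeholders:
+
+```python
+import mlflow
+
+# log a parameter and a metric to a run in a named experiment
+mlflow.set_experiment("demo-experiment")
+with mlflow.start_run():
+    mlflow.log_param("learning_rate", 0.3)
+    mlflow.log_metric("accuracy", 0.92)
+```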
+
+## Installation
+
+Install MLflow from PyPI via `pip install mlflow`
+
+MLflow requires `conda` to be on the `PATH` for the projects feature.
+
+Learn more about MLflow on their [GitHub page](https://github.com/mlflow/mlflow).
+
+
+### Install MLflow on Databricks
+
+If you're using Databricks, install MLflow with this command:
```
-# run this so that mlflow is installed on workers besides driver
+# run this so that MLflow is installed on the workers as well as the driver
%pip install mlflow
```
-Install SynapseML based on the [installation guidance](../getting_started/installation.md).
+### Install MLflow on Synapse
+To log models with MLflow, you need to create an Azure Machine Learning workspace and link it to your Synapse workspace.
+
+#### Create Azure Machine Learning Workspace
+
+Follow this document to create an [AML workspace](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources#create-the-workspace). You don't need to create a compute instance or compute clusters.
+
+#### Create an Azure ML Linked Service
+
+
+
+- In the Synapse workspace, go to **Manage** -> **External connections** -> **Linked services** and select **+ New**.
+- Select the workspace you want to log the model in and create the linked service. You need the **name of the linked service** to set up the connection.
-## API Reference
+#### Authorize the Synapse Workspace
+
+
+- Go to the **Azure Machine Learning workspace** resource -> **Access control (IAM)** -> **Role assignment**, select **+ Add**, and choose **Add role assignment**.
+- Choose **Contributor**, then select **Next**.
+- On the **Members** page, choose **Managed identity** and select **+ Select members**. Under **Managed identity**, choose the Synapse workspace. Under **Select**, choose the workspace you run your experiment on. Click **Select**, then **Review + assign**.
+
+
+#### Use MLflow in Synapse with a Linked Service
+Set up the connection:
+```python
+import mlflow
+
+# AML workspace authentication using the linked service
+from notebookutils.mssparkutils import azureML
+linked_service_name = "YourLinkedServiceName"
+ws = azureML.getWorkspace(linked_service_name)
+mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
+
+# Set the MLflow experiment.
+experiment_name = "synapse-mlflow-experiment"
+mlflow.set_experiment(experiment_name)
+```
+
+#### Use MLflow in Synapse without a Linked Service
+Once you create an AML workspace, you can obtain the MLflow tracking URL directly from the AML workspace start page.
+
+Set the tracking URL with:
+```python
+mlflow.set_tracking_uri("your mlflow tracking url")
+```
+
+
+## MLflow API Reference
* [mlflow.spark.save_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.save_model)
* [mlflow.spark.log_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.log_model)
* [mlflow.spark.load_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.load_model)
* [mlflow.log_metric](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.log_metric)
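+
+As a brief sketch of how these calls fit together (assuming `model` is an already fitted SparkML or SynapseML model), a run can log the model and later reload it by its run URI:
+
+```python
+import mlflow
+import mlflow.spark
+
+with mlflow.start_run() as run:
+    mlflow.spark.log_model(model, "model")
+
+# reload the logged model for scoring
+loaded_model = mlflow.spark.load_model(f"runs:/{run.info.run_id}/model")
+```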
-## LightGBMClassificationModel
+## Examples
+
+### LightGBMClassifier
```python
import mlflow
@@ -95,7 +160,7 @@ with mlflow.start_run():
mlflow.log_metric("accuracy", metrics[0]['accuracy'])
```
-## Cognitive Services
+### Cognitive Services
```python
import mlflow
diff --git a/docs/manifest.yaml b/docs/manifest.yaml
deleted file mode 100644
index d16b02a576..0000000000
--- a/docs/manifest.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-version: 0.1
-channels:
- - name: website
- is_active: true
- notebooks:
- - path: "./notebooks/features/"
diff --git a/docs/python/documentprojection/__init__.py b/docs/python/documentprojection/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/docs/python/documentprojection/__main__.py b/docs/python/documentprojection/__main__.py
deleted file mode 100644
index fb25b42fc3..0000000000
--- a/docs/python/documentprojection/__main__.py
+++ /dev/null
@@ -1,112 +0,0 @@
-from __future__ import absolute_import
-
-from .utils.manifest import parse_manifest
-from .utils.reflection import *
-from .utils.logging import *
-from .utils.notebook import *
-from .framework.pipeline import *
-from .channels import default_channels
-
-import re
-
-log = get_log(__name__)
-
-
-def get_channel_map(custom_channels_folder, cwd):
-
- sys.path.insert(0, "documentprojection")
-
- def camel_to_snake(name):
- s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
- s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
- return s2.replace("_channel", "")
-
- channels = default_channels.copy()
- if custom_channels_folder is not None and len(custom_channels_folder) > 0:
- channels.extend(get_channels_from_dir(custom_channels_folder, cwd))
- log.info(f"All channels: {channels}")
-
- channel_map = {
- k: v
- for k, v in [
- (camel_to_snake(channel.__name__), channel) for channel in channels
- ]
- }
- return channel_map
-
-
-def parse_args():
- log_level_choices = ["debug", "info", "warn", "error", "critical"]
-
- import argparse
-
- parser = argparse.ArgumentParser(description="Document Projection Pipeline")
- parser.add_argument(
- "project_root",
- metavar="ROOT",
- type=str,
- help="the root directory of the project",
- default=".",
- )
- parser.add_argument(
- "manifest",
- metavar="MANIFEST",
- type=str,
- help="a notebook or folder of notebooks to project",
- default="docs/manifest.yaml",
- )
- parser.add_argument(
- "-f", "--format", action="store_true", default=False, help="run only formatters"
- )
- parser.add_argument(
- "-p",
- "--publish",
- action="store_true",
- default=False,
- help="run publishers. forces -t and -f.",
- )
- parser.add_argument(
- "-c",
- "--channels",
- default="console",
- type=str,
- help="A channel or comma-separated list of channels through which the notebook(s) should be processed. defaults to console if not specified.",
- )
- parser.add_argument(
- "--customchannels",
- type=str,
- default=None,
- help="A folder containing custom channel implementations.",
- )
- parser.add_argument(
- "-v",
- "--loglevel",
- choices=log_level_choices,
- default="info",
- help="set log level",
- )
- return parser.parse_args()
-
-
-def run():
- args = parse_args()
- config_log(args.loglevel)
- log.debug("script executed with args: {}".format(args))
-
- args.project_root = os.path.abspath(args.project_root)
-
- if args.manifest is not None:
- import json
-
- log.info(f"Reading manifest file: {args.manifest}.")
- args.manifest = parse_manifest(args.manifest)
- log.debug(f"Manifest:\n{json.dumps(args.manifest, indent=4, sort_keys=True)}")
-
- channel_map = get_channel_map(args.customchannels, args.project_root)
- pipeline = DocumentProjectionPipeline(
- channel_map, config=PipelineConfig(vars(args))
- )
- pipeline.run()
-
-
-run()
diff --git a/docs/python/documentprojection/channels/__init__.py b/docs/python/documentprojection/channels/__init__.py
deleted file mode 100644
index 9bcca8458f..0000000000
--- a/docs/python/documentprojection/channels/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from ..utils.reflection import get_subclasses
-from ..framework import Channel
-
-default_channels = get_subclasses(__name__, Channel)
diff --git a/docs/python/documentprojection/channels/console.py b/docs/python/documentprojection/channels/console.py
deleted file mode 100644
index 1d034cd465..0000000000
--- a/docs/python/documentprojection/channels/console.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from ..framework import *
-from ..utils.logging import get_log
-from ..framework.markdown import MarkdownFormatter
-
-log = get_log(__name__)
-
-# A sample Console (no-operation) channel that 'publishes' to the console. Useful for testing.
-class ConsoleDoc(Document):
- def __init__(self, content, metadata):
- self.content = content
- self.metadata = metadata
-
-
-class ConsoleFormatter(MarkdownFormatter):
- def clean_markdown(self, markdown: str) -> str:
- return markdown
-
- def get_header(self, notebook: Notebook) -> str:
- return "This is a test header injected by the 'console' formatter."
-
- def get_metadata(self, notebook: Notebook) -> dict:
- notebook.metadata.update(
- {"source_path": notebook.path, "target_path": "stdout"}
- )
- return notebook.metadata
-
-
-class ConsolePublisher(Publisher):
- def publish(self, document: Document) -> bool:
- print(document.content)
- return True
-
-
-class ConsoleChannel(Channel):
- def __init__(self, _):
- self.formatter = ConsoleFormatter()
- self.publisher = ConsolePublisher()
diff --git a/docs/python/documentprojection/framework/__init__.py b/docs/python/documentprojection/framework/__init__.py
deleted file mode 100644
index 0e86c7e5c5..0000000000
--- a/docs/python/documentprojection/framework/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .objects import *
diff --git a/docs/python/documentprojection/framework/markdown.py b/docs/python/documentprojection/framework/markdown.py
deleted file mode 100644
index bbe518e8c4..0000000000
--- a/docs/python/documentprojection/framework/markdown.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from abc import ABC, abstractmethod
-from nbconvert import MarkdownExporter
-
-from ..framework.objects import *
-
-# Sample base formatter for documents that are projected to markdown, include some cleaning, and inject a header
-class MarkdownFormatter(ABC):
- def _add_header(markdown: str, header: str):
- content = f"{header}\n{markdown}"
- return content
-
- def _to_markdown(notebook: Notebook) -> str:
- exporter = MarkdownExporter()
- markdown, _ = exporter.from_notebook_node(notebook.data)
- return markdown
-
- @abstractmethod
- def clean_markdown(self, markdown: str) -> str:
- pass
-
- @abstractmethod
- def get_header(self, notebook: Notebook) -> str:
- pass
-
- def format(self, notebook: Notebook) -> Document:
- markdown = MarkdownFormatter._to_markdown(notebook)
- markdown = self.clean_markdown(markdown)
- content = MarkdownFormatter._add_header(markdown, self.get_header(notebook))
- return Document(content, self.get_metadata(notebook))
diff --git a/docs/python/documentprojection/framework/objects.py b/docs/python/documentprojection/framework/objects.py
deleted file mode 100644
index c6d64e7c05..0000000000
--- a/docs/python/documentprojection/framework/objects.py
+++ /dev/null
@@ -1,98 +0,0 @@
-from abc import ABC, abstractmethod
-from ..utils.notebook import *
-from ..utils.logging import get_log
-
-
-def _defaultrepr(cls):
- def __repr__(self):
- return type(self).__name__
-
- cls.__repr__ = __repr__
- return cls
-
-
-class Notebook:
- def __init__(self, path, metadata: dict = {}):
- self.path: str = path
- self.metadata: dict = metadata
- self.data: NotebookNode = Notebook._parse(self.path)
- self._repr = "Notebook(...{})".format("\\".join(self.path.split("\\")[-3:]))
-
- def __repr__(self):
- return self._repr
-
- def _parse(path: str) -> NotebookNode:
- return read(path, as_version=4)
-
-
-class DocumentMetadata:
- def __init__(self, source_path: str, target_path: str, dimensions: dict = {}):
- self.source_path = source_path
- self.target_path = target_path
- self.dimensions = dimensions
- self.dimensions["target_path"] = target_path
-
- def __repr__(self):
- return f"{repr(self.source_path)}:{repr(self.dimensions)}"
-
-
-class Document:
- def __init__(self, content, metadata: DocumentMetadata):
- self.content = content
- self.metadata = metadata
-
- def __repr__(self):
- return f"{repr(self.metadata)}"
-
-
-@_defaultrepr
-class Formatter(ABC):
- @abstractmethod
- def format(self, notebook: Notebook) -> Document:
- pass
-
- @abstractmethod
- def get_metadata(self, notebook: Notebook) -> DocumentMetadata:
- pass
-
-
-@_defaultrepr
-class Publisher(ABC):
- @abstractmethod
- def publish(self, document: Document) -> bool:
- pass
-
-
-class ChannelMetadata(dict):
- def __init__(self, dict: dict):
- self.__dict__.update(dict)
-
- def __repr__(self):
- return repr(self.__dict__)
-
- project_root = None
-
-
-@_defaultrepr
-class Channel(ABC):
- def __init__(
- self, formatter: Formatter, publisher: Publisher, config: ChannelMetadata
- ):
- self.formatter: Formatter = formatter
- self.publisher: Publisher = publisher
- self.config: ChannelMetadata = config
-
- def format(self, notebook: Notebook) -> Document:
- instance_log = get_log(self.__class__.__name__)
- instance_log.debug(f"Formatting {notebook}")
- content = self.formatter.format(notebook)
- instance_log.debug(f"Done formatting {notebook}.")
- return content
-
- def publish(self, document: Document) -> bool:
- instance_log = get_log(self.__class__.__name__)
- instance_log.debug(f"Publishing {document}")
- succeeded = self.publisher.publish(document)
- instance_log.debug(
- f"Publishing {document} {'SUCCEEDED' if succeeded else 'FAILED'}"
- )
diff --git a/docs/python/documentprojection/framework/pipeline.py b/docs/python/documentprojection/framework/pipeline.py
deleted file mode 100644
index 55053f7d7a..0000000000
--- a/docs/python/documentprojection/framework/pipeline.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from typing import List
-from ..utils.logging import get_log
-from .objects import *
-from ..utils.notebook import *
-from ..utils.parallelism import process_in_parallel
-
-log = get_log(__name__)
-
-
-class PipelineConfig(dict):
- def __init__(self, dict: dict):
- self.__dict__.update(dict)
-
- format = None
- publish = None
- channel = None
- project_root = None
- manifest = None
-
-
-class DocumentProjectionPipeline:
- def __init__(self, channel_map: dict, config: PipelineConfig = PipelineConfig({})):
- self.channel_map = channel_map
- self.config = config
-
- def run(self) -> None:
- log.debug(
- f"""DocumentProjectionPipeline running with:
- Mode: {self.config},
- Config: {self.config}"""
- )
-
- channels = self.config.manifest["channels"]
-
- if len(channels) == 0:
- raise Exception("No channels registered.")
-
- if not self.config.publish:
- log.warn(f"PUBLISH mode not enabled. Skipping publish step.")
-
- for channel_config in channels:
- if channel_config["name"] not in self.channel_map:
- raise Exception(
- f"Channel declared in manifest but no implementation was found: {channel_config['name']}. If this is a custom channel, make sure you have specified the custom channels folder."
- )
-
- for channel_config in channels:
- if channel_config["is_active"] == False:
- log.info(
- f"Skipping channel marked as inactive: {channel_config['name']}"
- )
- continue
- channel_metadata = ChannelMetadata(
- {
- key: self.config.__dict__[key]
- for key in ["project_root"]
- if key in self.config.__dict__
- }
- )
- if "metadata" in channel_config:
- channel_metadata.update(channel_config["metadata"])
- channel = self.channel_map[channel_config["name"]](channel_metadata)
-
- notebook_metadata = channel_config["notebooks"]
- notebooks = []
- for entry in notebook_metadata:
- parsed_notebooks = parse_notebooks([entry["path"]], recursive=True)
- notebooks.extend(
- [
- Notebook(parsed_notebook, metadata=entry)
- for parsed_notebook in parsed_notebooks
- ]
- )
- log.info(
- f"Processing {len(notebooks)} notebooks in parallel for: {repr(channel)}"
- )
-
- formatted_documents = process_in_parallel(channel.format, notebooks)
- if self.config.publish:
- process_in_parallel(channel.publish, formatted_documents)
- if self.config.format:
- for i in range(len(notebooks)):
- log.info(
- "Formatted content for {}:\n{}".format(
- notebooks[i], formatted_documents[i].content
- )
- )
- log.info(f"End formatted content for {notebooks[i]}")
-
-
-def collect_notebooks(paths: List[str], recursive: bool) -> List[Notebook]:
- return [Notebook(nb) for nb in parse_notebooks(paths, recursive)]
diff --git a/docs/python/documentprojection/utils/__init__.py b/docs/python/documentprojection/utils/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/docs/python/documentprojection/utils/logging.py b/docs/python/documentprojection/utils/logging.py
deleted file mode 100644
index 86cd8fc690..0000000000
--- a/docs/python/documentprojection/utils/logging.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import logging
-
-logging.basicConfig(
- level="INFO", format="%(asctime)s | %(name)s | %(levelname)s | %(message)s"
-)
-
-
-def config_log(level: str):
- logging.basicConfig(
- level=level.upper(),
- format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
- force=True,
- )
-
-
-def get_log(name: str):
- return logging.getLogger(name)
diff --git a/docs/python/documentprojection/utils/manifest.py b/docs/python/documentprojection/utils/manifest.py
deleted file mode 100644
index bb79165634..0000000000
--- a/docs/python/documentprojection/utils/manifest.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from .logging import *
-
-import yaml
-
-log = get_log(__name__)
-
-
-def parse_manifest(manifest_path):
- PARSER_VERSION = 0.1
- with open(manifest_path, "r") as file:
- try:
- data = yaml.safe_load(file)
- if float(data["version"]) > PARSER_VERSION:
- raise Exception(
- f"Manifest version {data['version']} is greater than parser version {PARSER_VERSION}. Failing."
- )
- return data
- except yaml.YAMLError as error:
- log.error("Failed to parse manifest file. Failing.")
- raise error
diff --git a/docs/python/documentprojection/utils/mock_notebook.ipynb b/docs/python/documentprojection/utils/mock_notebook.ipynb
deleted file mode 100644
index c5c538e63c..0000000000
--- a/docs/python/documentprojection/utils/mock_notebook.ipynb
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "# Mock Title\n",
- "Mock Text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Mock Comment\n",
- "print(\"Mock Print Statement\")"
- ]
- }
- ],
- "metadata": {
- "language_info": {
- "name": "python"
- },
- "tags": []
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/docs/python/documentprojection/utils/notebook.py b/docs/python/documentprojection/utils/notebook.py
deleted file mode 100644
index a5b6115ffb..0000000000
--- a/docs/python/documentprojection/utils/notebook.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import os
-from typing import List
-from nbformat import NotebookNode, read
-from pathlib import Path
-from .logging import get_log
-
-log = get_log(__name__)
-
-
-def get_mock_path():
- return str(
- os.path.join(os.path.dirname(os.path.realpath(__file__)), "mock_notebook.ipynb")
- )
-
-
-def parse_notebooks(notebooks: List[str], recursive=False) -> List[str]:
- if type(notebooks) is not list:
- raise ValueError(
- f"Notebooks must be a list of paths. Received {type(notebooks)}."
- )
- concrete_notebook_paths = []
- ignored_directories = []
- for notebook in notebooks:
- notebook = os.path.abspath(notebook)
- if not os.path.exists(notebook):
- raise ValueError(
- f"Specified notebook path {repr(notebook)} does not exist."
- )
- is_dir = os.path.isdir(notebook)
- if not is_dir:
- if not notebook.endswith(".ipynb"):
- raise ValueError(
- f"Specified notebook path {notebook} is not a notebook. Notebooks must have a .ipynb extension."
- )
- concrete_notebook_paths.append(notebook)
-
- # non-recursively scan for notebooks in the given directory
- if is_dir and not recursive:
- for file_or_dir in os.listdir(notebook):
- abs_path = os.path.join(notebook, file_or_dir)
- if file_or_dir.endswith(".ipynb"):
- concrete_notebook_paths.append(abs_path)
- if os.path.isdir(abs_path):
- ignored_directories.append(abs_path)
-
- if is_dir and recursive:
- for root, _, files in os.walk(notebook):
- for file_or_dir in files:
- if file_or_dir.endswith(".ipynb"):
- concrete_notebook_paths.append(os.path.join(root, file_or_dir))
-
- if len(ignored_directories) > 0 and not recursive:
- log.warn(
- "Recursive flag is not set. Ignoring the following directories:\n {}".format(
- "\n ".join(ignored_directories)
- )
- )
-
- num_notebooks = len(concrete_notebook_paths)
- leveled_log = log.warning if num_notebooks == 0 else log.debug
- leveled_log(
- f"Found {num_notebooks} notebooks to process: {repr(concrete_notebook_paths)}"
- )
- return concrete_notebook_paths
diff --git a/docs/python/documentprojection/utils/parallelism.py b/docs/python/documentprojection/utils/parallelism.py
deleted file mode 100644
index bfe5c328b3..0000000000
--- a/docs/python/documentprojection/utils/parallelism.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import concurrent.futures
-import threading
-from typing import List
-from tqdm import tqdm
-
-_global_lock = threading.Lock()
-
-_locks = {}
-
-
-def process_in_parallel(func, data: List):
- results = []
- with tqdm(total=len(data)) as progress:
- with concurrent.futures.ThreadPoolExecutor(
- max_workers=os.cpu_count()
- ) as executor:
- for result in executor.map(func, data):
- progress.update()
- results.append(result)
- return results
-
-
-def get_global_lock():
- return _global_lock
-
-
-def get_lock(key):
- with _global_lock:
- if key not in _locks:
- _locks[key] = threading.Lock()
- return _locks[key]
diff --git a/docs/python/documentprojection/utils/reflection.py b/docs/python/documentprojection/utils/reflection.py
deleted file mode 100644
index 7dc817a6f0..0000000000
--- a/docs/python/documentprojection/utils/reflection.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import sys
-import inspect
-import importlib
-import inspect
-import os
-import sys
-from pathlib import Path
-import pathlib
-
-from ..utils.logging import get_log
-
-log = get_log(__name__)
-
-
-def get_subclasses(module, class_):
- # Get the directory of the current module
- current_module_dir = os.path.dirname(sys.modules[module].__file__)
-
- # Get all the python files in the current module directory
- files = [f for f in os.listdir(current_module_dir) if f.endswith(".py")]
-
- # Dynamically import all modules in the current package
- modules = [
- importlib.import_module("." + f[:-3], module)
- for f in files
- if not f.startswith("__")
- ]
-
- # Get all members of each imported module
- all_members = [inspect.getmembers(module) for module in modules]
- all_members = [item[1] for sublist in all_members for item in sublist]
-
- # Filter out only the classes that are children of the Channel parent class
- return [
- m
- for m in all_members
- if inspect.isclass(m) and issubclass(m, class_) and m != class_
- ]
-
-
-def insert_current_module_into_syspath(cwd):
- current_file_path = Path(__file__)
- current_directory = current_file_path.parent.parent.parent
- import_path = os.path.relpath(current_directory.resolve(), cwd)
- sys.path.insert(0, import_path)
-
-
-def get_channels_from_dir(dir_, cwd):
- log.info(f"Importing channels from {dir_} with cwd {cwd}")
- insert_current_module_into_syspath(cwd)
- files = [
- file.absolute()
- for file in pathlib.Path(dir_).glob("**/*.py")
- if not file.absolute().name.startswith("__")
- ]
- modules = []
- for file_path in files:
- module_name = os.path.basename(file_path.resolve()).replace(".py", "")
- spec = importlib.util.spec_from_file_location(module_name, file_path)
- module = importlib.util.module_from_spec(spec)
- modules.append(module)
- sys.modules[module_name] = module
- spec.loader.exec_module(module)
-
- log.info(f"found extra modules: {modules}")
-
- # Get all members of each imported module
- all_members = [inspect.getmembers(module) for module in modules]
- all_members = [item[1] for sublist in all_members for item in sublist]
-
- from documentprojection.framework.objects import Channel
-
- channels = [
- m
- for m in all_members
- if inspect.isclass(m) and issubclass(m, Channel) and m != Channel
- ]
-
- return channels
diff --git a/docs/python/synapseml_channels/website.py b/docs/python/synapseml_channels/website.py
deleted file mode 100644
index bd2693dc8e..0000000000
--- a/docs/python/synapseml_channels/website.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import os
-import re
-
-from documentprojection.framework import *
-from documentprojection.utils.logging import get_log
-from documentprojection.framework.markdown import MarkdownFormatter
-from documentprojection.utils.parallelism import get_lock
-
-log = get_log(__name__)
-
-
-def get_project_root() -> str:
- """Returns project root folder."""
- # root of parent module
- filepath = Path(os.path.abspath(os.path.join(os.getcwd(), __file__)))
- return os.path.abspath(filepath.parent.parent.parent.parent.parent)
-
-
-class WebsiteDoc(Document):
- def __init__(self, content, metadata):
- self.content = content
- self.metadata = metadata
-
-
-class WebsiteFormatter(MarkdownFormatter):
- def __init__(self, config: ChannelMetadata):
- self.config = config
-
- def clean_markdown(self, markdown: str) -> str:
- markdown = re.sub(r"style=\"[\S ]*?\"", "", markdown)
- markdown = re.sub(r"", "", markdown)
+ title = basename(input_file).replace(".ipynb", "")
+ markdown = f"---\ntitle: {title}\nhide_title: true\nstatus: stable\n---\n{markdown}"
+
+ os.makedirs(dirname(output_file), exist_ok=True)
+ with open(output_file, "w+", encoding="utf-8") as f:
+ f.write(markdown)
+ else:
+ if os.path.isdir(input_file):
+ os.makedirs(output_file, exist_ok=True)
+ else:
+ os.makedirs(dirname(output_file), exist_ok=True)
+ shutil.copy(input_file, output_file)
diff --git a/tools/docgen/docgen/core.py b/tools/docgen/docgen/core.py
new file mode 100644
index 0000000000..aacb70c582
--- /dev/null
+++ b/tools/docgen/docgen/core.py
@@ -0,0 +1,37 @@
+import multiprocessing
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class Channel(ABC):
+ @abstractmethod
+    def process(self, input_file: str) -> None:
+ pass
+
+ @abstractmethod
+ def list_input_files(self) -> List[str]:
+ pass
+
+    def run(self) -> None:
+ for input_file in self.list_input_files():
+ self.process(input_file)
+
+
+class ParallelChannel(Channel):
+    def run(self) -> None:
+ with multiprocessing.Pool() as pool:
+ pool.map(self.process, self.list_input_files())
+
+
+class DocumentProcessor:
+ def __init__(self, channels: List[Channel]):
+ self.channels = channels
+
+ def run(self) -> None:
+ print(f"Running DocumentProcessor on {self.channels}")
+ if len(self.channels) == 0:
+ raise ValueError("No channels selected.")
+
+ for channel in self.channels:
+            print(f"Running Channel: {channel}")
+ channel.run()
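+
+
+# Hypothetical usage sketch (illustration only): EchoChannel is not part of docgen;
+# it just shows how a Channel subclass plugs into DocumentProcessor.
+if __name__ == "__main__":
+
+    class EchoChannel(Channel):
+        def list_input_files(self) -> List[str]:
+            return ["example.md"]
+
+        def process(self, input_file: str) -> None:
+            print(f"echo: {input_file}")
+
+    DocumentProcessor([EchoChannel()]).run()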
diff --git a/tools/docgen/docgen/manifest.yaml b/tools/docgen/docgen/manifest.yaml
new file mode 100644
index 0000000000..6494e393e0
--- /dev/null
+++ b/tools/docgen/docgen/manifest.yaml
@@ -0,0 +1,4 @@
+channels:
+ - name: "channels.WebsiteChannel"
+ input_dir: "../../../docs/"
+ output_dir: "../../../website/docs/"
diff --git a/docs/python/setup.py b/tools/docgen/setup.py
similarity index 85%
rename from docs/python/setup.py
rename to tools/docgen/setup.py
index edf2a06679..7ac6f7ff57 100644
--- a/docs/python/setup.py
+++ b/tools/docgen/setup.py
@@ -5,13 +5,13 @@
from setuptools import setup, find_packages
setup(
- name="documentprojection",
+ name="docgen",
+ packages=["docgen"],
version=0.1,
description="Synapse Machine Learning Documentation Pipeline",
long_description="SynapseML contains Microsoft's open source "
+ "contributions to the Apache Spark ecosystem",
license="MIT",
- packages=find_packages(),
url="https://github.com/Microsoft/SynapseML",
author="Microsoft",
author_email="synapseml-support@microsoft.com",
@@ -25,9 +25,7 @@
"Programming Language :: Python :: 3",
],
zip_safe=True,
- package_data={
- "documentprojection": ["../LICENSE.txt", "../README.txt", "./utils/*.ipynb"]
- },
+ package_data={"docgen": ["../LICENSE.txt", "../README.txt"]},
python_requires=">=3.8.8",
install_requires=["nbformat", "nbconvert", "pathlib", "argparse"],
)
diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile
index 6cccf99031..c0eeabbfd3 100644
--- a/tools/docker/demo/Dockerfile
+++ b/tools/docker/demo/Dockerfile
@@ -60,8 +60,8 @@ RUN jupyter-notebook --generate-config \
# Copy the init script for jupyter startup.
COPY tools/docker/demo/init_notebook.py /root/.ipython/profile_default/startup/init_notebook.py
-COPY notebooks notebooks
-WORKDIR notebooks/features
+COPY docs docs
+WORKDIR docs
# Jupyter Notebook UI
EXPOSE 8888
diff --git a/website/.gitignore b/website/.gitignore
index 3e53ef5c00..de06993cba 100644
--- a/website/.gitignore
+++ b/website/.gitignore
@@ -9,31 +9,7 @@
.cache-loader
# Converted markdowns
-/docs/features/*
-!/docs/features/causal_inference
-/docs/features/causal_inference/*
-!/docs/features/causal_inference/about.md
-!/docs/features/lightgbm
-/docs/features/lightgbm/*
-!/docs/features/lightgbm/about.md
-!/docs/features/onnx
-/docs/features/onnx/*
-!/docs/features/onnx/about.md
-!/docs/features/responsible_ai
-/docs/features/responsible_ai/*
-!/docs/features/responsible_ai/Data Balance Analysis.md
-!/docs/features/responsible_ai/Model Interpretation on Spark.md
-!/docs/features/simple_deep_learning
-/docs/features/simple_deep_learning/*
-!/docs/features/simple_deep_learning/about.md
-!/docs/features/simple_deep_learning/installation.md
-!/docs/features/spark_serving
-/docs/features/spark_serving/*
-!/docs/features/spark_serving/about.md
-!/docs/features/vw
-/docs/features/vw/*
-!/docs/features/vw/about.md
-/docs/features/hyperparameter_tuning/*
+/docs/*
# Misc
.DS_Store
diff --git a/website/blog/overview.md b/website/blog/overview.md
index ca7dc68689..09ea6eb811 100644
--- a/website/blog/overview.md
+++ b/website/blog/overview.md
@@ -1,15 +1,15 @@
---
-title: "Overview"
-description: "SynapseML Overview"
-keywords: [
- "SynapseML",
-]
+title: What is SynapseML?
+sidebar_label: What is SynapseML?
+hide_title: true
---
-Synapse Machine Learning expands the distributed computing framework [Apache Spark](https://github.com/apache/spark) in several new directions. SynapseML adds several machine learning frameworks to the SparkML Ecosystem, including [LightGBM](/docs/features/lightgbm/about), [Vowpal Wabbit](/docs/features/vw/about), [OpenCV](https://opencv.org/), [Isolation Forest](https://github.com/linkedin/isolation-forest), and the [Microsoft Cognitive Toolkit (CNTK)](https://www.microsoft.com/en-us/research/product/cognitive-toolkit/). These tools allow users to craft powerful and highly scalable models that span multiple ML ecosystems.
+import useBaseUrl from "@docusaurus/useBaseUrl";
-
+# What is SynapseML?
-SynapseML also brings new networking capabilities to the Spark ecosystem. With the HTTP on Spark project, users can embed any web service into their SparkML models and use their Spark clusters for massive networking workflows. In this vein, SynapseML provides easy to use SparkML transformers for a wide variety of Azure Cognitive Services. Finally, the Spark Serving project enables high throughput, submillisecond latency web services, backed by your Spark cluster.
+SynapseML (previously known as MMLSpark) is an open-source library that simplifies the creation of massively scalable machine learning (ML) pipelines. SynapseML provides simple, composable, and distributed APIs for a wide variety of machine learning tasks such as text analytics, vision, anomaly detection, and many others. SynapseML is built on the [Apache Spark distributed computing framework](https://spark.apache.org/) and shares the same API as the [SparkML/MLLib library](https://spark.apache.org/mllib/), allowing you to seamlessly embed SynapseML models into existing Apache Spark workflows.
-Visit the SynapseML GitHub repository to learn more.
+With SynapseML, you can build scalable and intelligent systems to solve challenges in domains such as anomaly detection, computer vision, deep learning, text analytics, and others. SynapseML can train and evaluate models on single-node, multi-node, and elastically resizable clusters of computers. This lets you scale your work without wasting resources. SynapseML is usable across Python, R, Scala, Java, and .NET. Furthermore, its API abstracts over a wide variety of databases, file systems, and cloud data stores to simplify experiments no matter where data is located.
+
+SynapseML requires Scala 2.12, Spark 3.2+, and Python 3.8+.
diff --git a/website/docs/about.md b/website/docs/about.md
deleted file mode 100644
index 0220097d54..0000000000
--- a/website/docs/about.md
+++ /dev/null
@@ -1,55 +0,0 @@
----
-title: SynapseML
-sidebar_label: Introduction
-hide_title: true
----
-
-import useBaseUrl from "@docusaurus/useBaseUrl";
-
-
-
-# SynapseML
-
-SynapseML is an ecosystem of tools aimed towards expanding the distributed computing framework
-[Apache Spark](https://github.com/apache/spark) in several new directions.
-SynapseML adds many deep learning and data science tools to the Spark ecosystem,
-including seamless integration of Spark Machine Learning pipelines with [Microsoft Cognitive Toolkit
-(CNTK)](https://github.com/Microsoft/CNTK), [LightGBM](https://github.com/Microsoft/LightGBM) and
-[OpenCV](http://www.opencv.org/). These tools enable powerful and highly scalable predictive and analytical models
-for many types of datasources.
-
-SynapseML also brings new networking capabilities to the Spark Ecosystem. With the HTTP on Spark project, users
-can embed **any** web service into their SparkML models. In this vein, SynapseML provides easy to use
-SparkML transformers for a wide variety of [Azure Cognitive Services](https://azure.microsoft.com/en-us/services/cognitive-services/). For production grade deployment, the Spark Serving project enables high throughput,
-submillisecond latency web services, backed by your Spark cluster.
-
-SynapseML requires Scala 2.12, Spark 3.2+, and Python 3.8+.
-See the API documentation [for
-Scala](https://mmlspark.blob.core.windows.net/docs/0.11.2/scala/index.html#package) and [for
-PySpark](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/index.html).
-
-import Link from '@docusaurus/Link';
-
-Get Started
-
-## Examples
-
-import NotebookExamples from "@theme/NotebookExamples";
-
-
-
-## Explore our Features
-
-import FeatureCards from "@theme/FeatureCards";
-
-
-
-## Papers
-
-- [Large Scale Intelligent Microservices](https://arxiv.org/abs/2009.08044)
-
-- [Conditional Image Retrieval](https://arxiv.org/abs/2007.07177)
-
-- [SynapseML: Unifying Machine Learning Ecosystems at Massive Scales](https://arxiv.org/abs/1810.08744)
-
-- [Flexible and Scalable Deep Learning with MMLSpark](https://arxiv.org/abs/1804.04031)
diff --git a/website/docs/documentation/estimators/causal/_causalInferenceDML.md b/website/docs/documentation/estimators/causal/_causalInferenceDML.md
deleted file mode 100644
index d39eb7b35f..0000000000
--- a/website/docs/documentation/estimators/causal/_causalInferenceDML.md
+++ /dev/null
@@ -1,100 +0,0 @@
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-import DocTable from "@theme/DocumentationTable";
-
-
-## DoubleMLEstimator
-
-
-
-
-
-
-```python
-from synapse.ml.causal import *
-from pyspark.ml.classification import LogisticRegression
-from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, BooleanType
-
-schema = StructType([
- StructField("Treatment", BooleanType()),
- StructField("Outcome", BooleanType()),
- StructField("col2", DoubleType()),
- StructField("col3", DoubleType()),
- StructField("col4", DoubleType())
- ])
-
-
-df = spark.createDataFrame([
- (False, True, 0.30, 0.66, 0.2),
- (True, False, 0.38, 0.53, 1.5),
- (False, True, 0.68, 0.98, 3.2),
- (True, False, 0.15, 0.32, 6.6),
- (False, True, 0.50, 0.65, 2.8),
- (True, True, 0.40, 0.54, 3.7),
- (False, True, 0.78, 0.97, 8.1),
- (True, False, 0.12, 0.32, 10.2),
- (False, True, 0.35, 0.63, 1.8),
- (True, False, 0.45, 0.57, 4.3),
- (False, True, 0.75, 0.97, 7.2),
- (True, True, 0.16, 0.32, 11.7)], schema
-)
-
-dml = (DoubleMLEstimator()
- .setTreatmentCol("Treatment")
- .setTreatmentModel(LogisticRegression())
- .setOutcomeCol("Outcome")
- .setOutcomeModel(LogisticRegression())
- .setMaxIter(20))
-
-dmlModel = dml.fit(df)
-dmlModel.getAvgTreatmentEffect()
-dmlModel.getConfidenceInterval()
-```
-
-
-
-
-```scala
-import com.microsoft.azure.synapse.ml.causal._
-import org.apache.spark.ml.classification.LogisticRegression
-
-val df = (Seq(
- (false, true, 0.50, 0.60, 0),
- (true, false, 0.40, 0.50, 1),
- (false, true, 0.78, 0.99, 2),
- (true, false, 0.12, 0.34, 3),
- (false, true, 0.50, 0.60, 0),
- (true, false, 0.40, 0.50, 1),
- (false, true, 0.78, 0.99, 2),
- (true, false, 0.12, 0.34, 3),
- (false, false, 0.50, 0.60, 0),
- (true, true, 0.40, 0.50, 1),
- (false, true, 0.78, 0.99, 2),
- (true, false, 0.12, 0.34, 3))
- .toDF("Treatment", "Outcome", "col2", "col3", "col4"))
-
-val dml = (new DoubleMLEstimator()
- .setTreatmentCol("Treatment")
- .setTreatmentModel(new LogisticRegression())
- .setOutcomeCol("Outcome")
- .setOutcomeModel(new LogisticRegression())
- .setMaxIter(20))
-
-val dmlModel = dml.fit(df)
-dmlModel.getAvgTreatmentEffect
-dmlModel.getConfidenceInterval
-```
-
-
-
-
-
diff --git a/website/docs/documentation/estimators/estimators_causal.md b/website/docs/documentation/estimators/estimators_causal.md
deleted file mode 100644
index 80ae2e5aaf..0000000000
--- a/website/docs/documentation/estimators/estimators_causal.md
+++ /dev/null
@@ -1,13 +0,0 @@
----
-title: Estimators - Causal
-sidebar_label: Causal Inference
-hide_title: true
----
-
-# Causal Inference
-
-import DoubleMLEstimator, {toc as DoubleMLEstimatorTOC} from './causal/_causalInferenceDML.md';
-
-
-
-export const toc = [...DoubleMLEstimatorTOC]
diff --git a/website/docs/features/lightgbm/about.md b/website/docs/features/lightgbm/about.md
deleted file mode 100644
index bed6b7b8ee..0000000000
--- a/website/docs/features/lightgbm/about.md
+++ /dev/null
@@ -1,262 +0,0 @@
----
-title: LightGBM
-hide_title: true
-sidebar_label: About
----
-
-# LightGBM on Apache Spark
-
-### LightGBM
-
-[LightGBM](https://github.com/Microsoft/LightGBM) is an open-source,
-distributed, high-performance gradient boosting (GBDT, GBRT, GBM, or
-MART) framework. This framework specializes in creating high-quality and
-GPU enabled decision tree algorithms for ranking, classification, and
-many other machine learning tasks. LightGBM is part of Microsoft's
-[DMTK](http://github.com/microsoft/dmtk) project.
-
-### Advantages of LightGBM through SynapseML
-
-- **Composability**: LightGBM models can be incorporated into existing
- SparkML Pipelines, and used for batch, streaming, and serving
- workloads.
-- **Performance**: LightGBM on Spark is 10-30% faster than SparkML on
- the Higgs dataset, and achieves a 15% increase in AUC. [Parallel
- experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#parallel-experiment)
- have verified that LightGBM can achieve a linear speed-up by using
- multiple machines for training in specific settings.
-- **Functionality**: LightGBM offers a wide array of [tunable
- parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst),
- that one can use to customize their decision tree system. LightGBM on
- Spark also supports new types of problems such as quantile regression.
-- **Cross platform** LightGBM on Spark is available on Spark, PySpark, and SparklyR
-
-### Usage
-
-In PySpark, you can run the `LightGBMClassifier` via:
-
-```python
-from synapse.ml.lightgbm import LightGBMClassifier
-model = LightGBMClassifier(learningRate=0.3,
- numIterations=100,
- numLeaves=31).fit(train)
-```
-
-Similarly, you can run the `LightGBMRegressor` by setting the
-`application` and `alpha` parameters:
-
-```python
-from synapse.ml.lightgbm import LightGBMRegressor
-model = LightGBMRegressor(application='quantile',
- alpha=0.3,
- learningRate=0.3,
- numIterations=100,
- numLeaves=31).fit(train)
-```
-
-For an end to end application, check out the LightGBM [notebook
-example](../LightGBM%20-%20Overview).
-
-### Arguments/Parameters
-
-SynapseML exposes getters and setters for many common LightGBM parameters.
-In Python, you can pass them as keyword arguments when constructing the estimator; in Scala, use the
-fluent setters. Examples of both are shown in this section.
-
-```scala
-import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier
-val classifier = new LightGBMClassifier()
- .setLearningRate(0.2)
- .setNumLeaves(50)
-```
-
-LightGBM has far more parameters than SynapseML exposes. For cases where you
-need to set some parameters that SynapseML doesn't expose a setter for, use
-*passThroughArgs*. This argument is a free-form string that you can use to add extra parameters
-to the command SynapseML sends to configure LightGBM.
-
-In Python:
-```python
-from synapse.ml.lightgbm import LightGBMClassifier
-model = LightGBMClassifier(passThroughArgs="force_row_wise=true min_sum_hessian_in_leaf=2e-3",
- numIterations=100,
- numLeaves=31).fit(train)
-```
-
-In Scala:
-```scala
-import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier
-val classifier = new LightGBMClassifier()
- .setPassThroughArgs("force_row_wise=true min_sum_hessian_in_leaf=2e-3")
- .setLearningRate(0.2)
- .setNumLeaves(50)
-```
-
-For formatting options and specific argument documentation, see
-[LightGBM docs](https://lightgbm.readthedocs.io/en/v3.3.2/Parameters.html). SynapseML sets some
-parameters specifically for the Spark distributed environment and
-shouldn't be changed. Some parameters are for CLI mode only, and don't work within
-Spark.
-
-You can mix *passThroughArgs* and explicit args, as shown in the example. SynapseML
-merges them to create one argument string to send to LightGBM. If you set a parameter in
-both places, *passThroughArgs* takes precedence.
-
-### Architecture
-
-LightGBM on Spark uses the Simple Wrapper and Interface Generator (SWIG)
-to add Java support for LightGBM. These Java bindings use the Java Native
-Interface to call into the [distributed C++
-API](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h).
-
-We initialize LightGBM by calling
-[`LGBM_NetworkInit`](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h)
-on the Spark executors within a MapPartitions call. We then pass each
-worker's partitions into LightGBM to create the in-memory distributed
-dataset for LightGBM. Finally, we train LightGBM to produce a model
-that can then be used for inference.
-
-The `LightGBMClassifier` and `LightGBMRegressor` use the SparkML API,
-inherit from the same base classes, integrate with SparkML pipelines,
-and can be tuned with [SparkML's cross
-validators](https://spark.apache.org/docs/latest/ml-tuning.html).
-
-Trained models can be saved as a SparkML pipeline with the native LightGBM model
-embedded by using `saveNativeModel()`. Additionally, they're fully compatible with [PMML](https://en.wikipedia.org/wiki/Predictive_Model_Markup_Language) and
-can be converted to PMML format through the
-[JPMML-SparkML-LightGBM](https://github.com/alipay/jpmml-sparkml-lightgbm) plugin.
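-
-As a minimal sketch (assuming `model` is a fitted `LightGBMClassificationModel`, the path is a placeholder, and your SynapseML version exposes `loadNativeModelFromFile`), saving and reloading the native booster from Python might look like this:
-
-```python
-from synapse.ml.lightgbm import LightGBMClassificationModel
-
-# Save the native LightGBM booster that backs the SparkML model
-model.saveNativeModel("/models/lgbm_native")
-
-# Reload the native booster as a SparkML-compatible model
-loaded = LightGBMClassificationModel.loadNativeModelFromFile("/models/lgbm_native")
-```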
-
-#### Dynamic Allocation Limitations
-The native LightGBM library has a *distributed mode* that allows the algorithm to work over multiple *machines*. SynapseML
-uses this mode to call LightGBM from Spark. SynapseML first gathers all the Spark executor networking information, passes that to LightGBM, and then
-waits for LightGBM to complete its work. However, the native LightGBM algorithm implementation assumes all networking is constant over the time period of a single
-training or scoring session. The native LightGBM distributed mode was designed this way; it isn't a limitation of SynapseML itself.
-
-Dynamic compute changes can cause LightGBM problems if the Spark executors change during data processing. Spark can naturally
-take advantage of cluster autoscaling and can also dynamically replace any failed executor with another, but LightGBM can't
-handle these networking changes. Large datasets are affected in particular since they're more likely to cause executor scaling
-or have a single executor fail during a single processing pass.
-
-If you're experiencing problems with LightGBM as exposed through SynapseML due to executor changes (for example, occasional Task failures or networking hangs),
-there are several options.
-1. In the Spark platform, turn off any autoscaling on the cluster you have provisioned.
-2. Set *numTasks* manually to be smaller so that fewer executors are used (reducing probability of single executor failure).
-3. Turn off dynamic executor scaling with configuration in a notebook cell. In Synapse and Fabric, you can use:
-
-```python
- %%configure
- {
- "conf":
- {
- "spark.dynamicAllocation.enabled": "false"
- }
- }
-```
-Note: setting any custom configuration can affect cluster startup time if your compute platform takes advantage of "live pools"
-to improve notebook performance.
-
-If you still have problems, you can consider splitting your data into smaller segments using *numBatches*. Splitting into multiple
-batches increases total processing time, but can potentially be used to increase reliability.
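-
-As a rough sketch (the right batch count is workload-dependent and the value below is only illustrative), you might set *numBatches* from Python like this:
-
-```python
-from synapse.ml.lightgbm import LightGBMClassifier
-
-# Split training into 4 sequential batches to reduce pressure on any single pass
-classifier = LightGBMClassifier(objective="binary",
-                                numIterations=100,
-                                numBatches=4)
-# model = classifier.fit(train)  # assumes a prepared `train` DataFrame
-```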
-
-### Data Transfer Mode
-
-SynapseML must pass data from Spark partitions to LightGBM native Datasets before turning over control to
-the actual LightGBM execution code for training and inference. SynapseML has two modes
-that control how this data is transferred: *streaming* and *bulk*.
-This mode doesn't affect training but can affect memory usage and overall fit/transform time.
-
-#### Bulk Execution mode
-The "Bulk" mode is older and requires accumulating all data in executor memory before creating Datasets. This mode can cause
-OOM errors for large data, especially since the data must be accumulated in its original uncompressed double-format size.
-For now, "bulk" mode is the default since "streaming" is new, but SynapseML will eventually make streaming the default.
-
-For bulk mode, native LightGBM Datasets can either be created per partition (useSingleDatasetMode=false), or
-per executor (useSingleDatasetMode=true). Generally, one Dataset per executor is more efficient since it reduces LightGBM network size and complexity during training or fitting. It also avoids using slow network protocols on partitions
-that are actually on the same executor node.
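-
-For illustration, a hedged Python sketch of opting into one Dataset per executor in bulk mode (assuming the Python wrapper exposes the same parameter name):
-
-```python
-from synapse.ml.lightgbm import LightGBMRegressor
-
-# One native Dataset per executor rather than one per partition
-regressor = LightGBMRegressor(application="quantile",
-                              alpha=0.3,
-                              useSingleDatasetMode=True)
-```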
-
-#### Streaming Execution Mode
-The "streaming" execution mode uses new native LightGBM APIs created just for SynapseML that don't require loading extra copies of the data into memory. In particular, data is passed directly
-from partitions to Datasets in small "micro-batches", similar to Spark streaming. The `microBatchSize` parameter controls the size of these micro-batches.
-Smaller micro-batch sizes reduce memory overhead, but larger sizes avoid the overhead of repeatedly transferring data to the native layer. The default
-of 100 uses far less memory than bulk mode since only 100 rows of data are loaded at a time. If your dataset has
-few columns, you can increase the batch size. Alternatively, if
-your dataset has a large number of columns you can decrease the micro-batch size to avoid OOM issues.
-
-These new streaming APIs in LightGBM are thread-safe, and allow all partitions in the same executor
-to push data into a shared Dataset in parallel. Because of this, streaming mode always uses the more efficient
-"useSingleDatasetMode=true", creating only one Dataset per executor.
-
-You can explicitly specify the execution mode and micro-batch size as parameters:
-
-```scala
-val lgbm = new LightGBMClassifier()
-  .setExecutionMode("streaming")
-  .setMicroBatchSize(100)
-  .setLabelCol(labelColumn)
-  .setObjective("binary")
-  ...
-```
-
-
-For streaming mode, only one Dataset is created per executor, so *useSingleDatasetMode* has no effect. It's effectively always true.
-
-### Data Sampling
-
-For the LightGBM algorithm to work, it must first create a set of bin boundaries for optimization. It does this calculation by
-sampling the data before any training or inferencing starts ([LightGBM docs](https://github.com/Microsoft/LightGBM)). The number of
-samples to use is set using *binSampleCount*, which must be at least a minimal percentage of the data or LightGBM rejects it.
-
-For *bulk* mode, this sampling is automatically done over the entire data, and each executor uses its own partitions to calculate samples for only
-a subset of the features. This distributed sampling can have subtle effects since partitioning can affect the calculated bins.
-Also, all data is sampled no matter what.
-
-For *streaming* mode, there are more explicit user controls for this sampling, and it's all done from the driver.
-The *samplingMode* property controls the behavior. The efficiency of these methods increases from first to last.
-- *global* - Like bulk mode, the random sample is calculated by iterating over the entire data (hence the data is traversed twice)
-- *subset* - (default) Samples only from the first *samplingSubsetSize* elements. Assumes this subset is representative.
-- *fixed* - There's no random sample. The first *binSampleSize* rows are used. Assumes randomized data.
-For large row counts, *subset* and *fixed* modes can save a first iteration over the entire data.
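-
-A sketch of choosing a sampling mode, assuming the Python wrapper mirrors the *executionMode*, *samplingMode*, and *samplingSubsetSize* properties described above (names may differ in your version):
-
-```python
-from synapse.ml.lightgbm import LightGBMClassifier
-
-classifier = LightGBMClassifier(objective="binary",
-                                executionMode="streaming",
-                                samplingMode="subset",      # default: sample only from the leading rows
-                                samplingSubsetSize=100000)  # size of the subset assumed representative
-```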
-
-#### Reference Dataset
-The sampling of the data to calculate bin boundaries happens on every *fit* call.
-If you repeat a fit many times (for example, during hyperparameter tuning), this calculation is duplicated effort.
-
-For *streaming* mode, there's an optimization that a client can set to reuse the previously calculated bin boundaries. The
-sampling calculation results in a *reference dataset*, which can be reused. After a fit, the estimator exposes a *referenceDataset* property
-holding the dataset that was calculated and used for that fit. If that property is set on the next estimator (or you reuse the same one),
-it will be used instead of resampling the data.
-
-```python
-from synapse.ml.lightgbm import LightGBMClassifier
-classifier = LightGBMClassifier(learningRate=0.3,
- numIterations=100,
- numLeaves=31)
-model1 = classifier.fit(train)
-
-classifier.setLearningRate(0.4)
-model2 = classifier.fit(train)
-```
-The `model2` call to `fit` doesn't resample the data and uses the same bin boundaries as `model1`.
-
-*Caution*: Some parameters actually affect the bin boundary calculation and require the use of a new reference dataset every time.
-These parameters include *isEnableSparse*, *useMissing*, and *zeroAsMissing*, all of which you can set from SynapseML. If you manually set
-some parameters with *passThroughArgs*, check the LightGBM docs to see whether they affect bin boundaries. If you're setting
-any parameter that affects bin boundaries and reusing the same estimator, set *referenceDataset* to an empty array between calls.
-
-### Barrier Execution Mode
-
-By default, LightGBM uses the regular Spark paradigm for launching tasks and communicates with the driver to coordinate task execution.
-The driver thread aggregates all task host:port information and then communicates the full list back to the workers in order for NetworkInit to be called.
-This procedure requires the driver to know how many tasks there are, and a mismatch between the expected number of tasks and the actual number causes
-the initialization to deadlock.
-
-If you're experiencing network issues, you can try using Spark's *barrier* execution mode. SynapseML provides a `UseBarrierExecutionMode` flag,
-to use Apache Spark's `barrier()` stage to ensure all tasks execute at the same time.
-Barrier execution mode changes the logic to aggregate `host:port` information across all tasks in a synchronized way.
-To use it in Scala, call `setUseBarrierExecutionMode(true)`, for example:
-
-```scala
-val lgbm = new LightGBMClassifier()
-  .setLabelCol(labelColumn)
-  .setObjective(binaryObjective)
-  .setUseBarrierExecutionMode(true)
-  ...
-```
-
-Note: barrier execution mode can also cause complicated issues, so use it only if needed.
\ No newline at end of file
diff --git a/website/docs/features/onnx/about.md b/website/docs/features/onnx/about.md
deleted file mode 100644
index baec0d8e6c..0000000000
--- a/website/docs/features/onnx/about.md
+++ /dev/null
@@ -1,108 +0,0 @@
----
-title: ONNX model inferencing on Spark
-hide_title: true
-sidebar_label: About
-description: Learn how to use the ONNX model transformer to run inference for an ONNX model on Spark.
----
-
-# ONNX model inferencing on Spark
-
-## ONNX
-
-[ONNX](https://onnx.ai/) is an open format to represent both deep learning and traditional machine learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them.
-
-SynapseML now includes a Spark transformer to bring a trained ONNX model to Apache Spark, so you can run inference on your data with Spark's large-scale data processing power.
-
-## ONNXHub
-Although you can use your own local model, many popular existing models are provided through the ONNXHub. You can use
-a model's ONNXHub name (for example, "MNIST") to download the model's bytes along with some metadata about the model. You can also list the
-available models, optionally filtering by name or tags.
-
-```scala
- // List models
- val hub = new ONNXHub()
- val models = hub.listModels(model = Some("mnist"), tags = Some(Seq("vision")))
-
- // Retrieve and transform with a model
-  val name = "resnet50"
-  val info = hub.getModelInfo(name)
-  val bytes = hub.load(name)
- val model = new ONNXModel()
- .setModelPayload(bytes)
- .setFeedDict(Map("data" -> "features"))
- .setFetchDict(Map("rawPrediction" -> "resnetv24_dense0_fwd"))
- .setSoftMaxDict(Map("rawPrediction" -> "probability"))
- .setArgMaxDict(Map("rawPrediction" -> "prediction"))
- .setMiniBatchSize(1)
-
- val (probability, _) = model.transform({YOUR_DATAFRAME})
- .select("probability", "prediction")
- .as[(Vector, Double)]
- .head
-```
-
-## Usage
-
-1. Create a `com.microsoft.azure.synapse.ml.onnx.ONNXModel` object and use `setModelLocation` or `setModelPayload` to load the ONNX model.
-
- For example:
-
- ```scala
- val onnx = new ONNXModel().setModelLocation("/path/to/model.onnx")
- ```
-
- Optionally, create the model from the ONNXHub.
-
- ```scala
- val onnx = new ONNXModel().setModelPayload(hub.load("MNIST"))
- ```
-2. Use an ONNX visualization tool (for example, [Netron](https://netron.app/)) to inspect the ONNX model's input and output nodes.
-
- ![Screenshot that illustrates an ONNX model's input and output nodes](https://mmlspark.blob.core.windows.net/graphics/ONNXModelInputsOutputs.png)
-
-3. Set the appropriate parameters on the `ONNXModel` object.
-
- The `com.microsoft.azure.synapse.ml.onnx.ONNXModel` class provides a set of parameters to control the behavior of the inference.
-
- | Parameter | Description | Default Value |
- |:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------|
- | feedDict | Map the ONNX model's expected input node names to the input DataFrame's column names. Make sure the input DataFrame's column schema matches with the corresponding input's shape of the ONNX model. For example, an image classification model may have an input node of shape `[1, 3, 224, 224]` with type Float. It's assumed that the first dimension (1) is the batch size. Then the input DataFrame's corresponding column's type should be `ArrayType(ArrayType(ArrayType(FloatType)))`. | None |
- | fetchDict | Map the output DataFrame's column names to the ONNX model's output node names. NOTE: If you put outputs that are intermediate in the model, transform will automatically slice at those outputs. See the section on [Slicing](#slicing). | None |
- | miniBatcher | Specify the MiniBatcher to use. | `FixedMiniBatchTransformer` with batch size 10 |
- | softMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to `Map("rawPrediction" -> "probability")` to obtain the probability outputs. | None |
- | argMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label. | None |
- | deviceType | Specify a device type the model inference runs on. Supported types are: CPU or CUDA. If not specified, auto detection will be used. | None |
- | optimizationLevel | Specify the [optimization level](https://onnxruntime.ai/docs/resources/graph-optimizations.html#graph-optimization-levels) for the ONNX graph optimizations. Supported values are: `NO_OPT`, `BASIC_OPT`, `EXTENDED_OPT`, `ALL_OPT`. | `ALL_OPT` |
-
-4. Call the `transform` method to run inference on the input DataFrame.
-
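-The steps above use Scala; a rough Python equivalent (the model path, feed/fetch names, and batch size below are placeholders, not fixed values) might look like this:
-
-```python
-from synapse.ml.onnx import ONNXModel
-
-with open("/path/to/model.onnx", "rb") as f:  # placeholder path
-    model_payload = f.read()
-
-onnx = (ONNXModel()
-    .setModelPayload(model_payload)
-    .setFeedDict({"data": "features"})                    # ONNX input node -> DataFrame column
-    .setFetchDict({"rawPrediction": "output_node_name"})  # DataFrame column -> ONNX output node
-    .setMiniBatchSize(64))
-
-# scored = onnx.transform(df)  # `df` must contain the column named in the feed dict
-```
-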
-## Model Slicing
-By default, an ONNX model is treated as a black box with inputs and outputs.
-If you want to use intermediate nodes of a model, you can slice the model at particular nodes. Slicing will create a new model,
-keeping only parts of the model that are needed for those nodes. This new model's outputs will be the outputs from
-the intermediate nodes. You can save the sliced model and use it to transform just like any other ONNXModel.
-
-This slicing feature is used implicitly by the ImageFeaturizer, which uses ONNX models. The ONNXHub manifest entry for each model
-includes which intermediate node outputs should be used for featurization, so the ImageFeaturizer will automatically slice at the correct nodes.
-
-The example below shows how to perform the slicing manually with a direct `ONNXModel`.
-
-```scala
-  // assumes df: a DataFrame with image data
-  val hub = new ONNXHub()
-  val name = "resnet50"
-  val info = hub.getModelInfo(name)
-  val bytes = hub.load(name)
- val intermediateOutputName = "resnetv24_pool1_fwd"
- val slicedModel = new ONNXModel()
- .setModelPayload(bytes)
- .setFeedDict(Map("data" -> "features"))
- .setFetchDict(Map("rawFeatures" -> intermediateOutputName)) // automatic slicing based on fetch dictionary
- // -- or --
- // .sliceAtOutput(intermediateOutputName) // manual slicing
-
- val slicedModelDf = slicedModel.transform(df)
-```
-
-## Example
-
-- [Interpretability - Image Explainers](../../responsible_ai/Interpretability%20-%20Image%20Explainers)
-- [ONNX - Inference on Spark](../ONNX%20-%20Inference%20on%20Spark)
diff --git a/website/docs/features/responsible_ai/Model Interpretation on Spark.md b/website/docs/features/responsible_ai/Model Interpretation on Spark.md
deleted file mode 100644
index 93dbc54cef..0000000000
--- a/website/docs/features/responsible_ai/Model Interpretation on Spark.md
+++ /dev/null
@@ -1,174 +0,0 @@
----
-title: Model Interpretation on Spark
-hide_title: true
-sidebar_label: Model Interpretation on Spark
----
-
-# Model Interpretation on Spark
-
-## Interpretable Machine Learning
-
-Interpretable Machine Learning helps developers, data scientists and business stakeholders in the organization gain a comprehensive understanding of their machine learning models. It can also be used to debug models, explain predictions and enable auditing to meet compliance with regulatory requirements.
-
-## Why run model interpretation on Spark
-
-Model-agnostic interpretation methods can be computationally expensive due to the multiple evaluations needed to compute the explanations. Model interpretation on Spark enables users to interpret a black-box model at massive scales with the Apache Spark™ distributed computing ecosystem. Various components support local interpretation for tabular, vector, image and text classification models, with two popular model-agnostic interpretation methods: [LIME] and [Kernel SHAP].
-
-[LIME]: https://arxiv.org/abs/1602.04938
-
-[Kernel SHAP]: https://arxiv.org/abs/1705.07874
-
-## Usage
-
-Both LIME and Kernel SHAP are local interpretation methods. Local interpretation explains why the model predicts a certain outcome for a given observation.
-
-Both explainers extend `org.apache.spark.ml.Transformer`. After setting the explainer parameters, simply call the `transform` function on a `DataFrame` of observations to interpret the model behavior on those observations.
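-
-For instance, here's a hedged Python sketch of a tabular Kernel SHAP explainer; the feature list `feature_cols`, the fitted probabilistic `model`, the `background` DataFrame, and the `observations` DataFrame are assumed to exist:
-
-```python
-from synapse.ml.explainers import TabularSHAP
-
-shap = (TabularSHAP()
-    .setInputCols(feature_cols)      # names of the model's input columns
-    .setOutputCol("shapValues")
-    .setTargetCol("probability")
-    .setTargetClasses([1])           # explain the positive class
-    .setModel(model)
-    .setBackgroundData(background))
-
-# explanations = shap.transform(observations)
-```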
-
-To see examples of model interpretability on Spark in action, take a look at these sample notebooks:
-
-- [Tabular SHAP explainer](../../../features/responsible_ai/Interpretability%20-%20Tabular%20SHAP%20explainer)
-- [Image explainers](../../../features/responsible_ai/Interpretability%20-%20Image%20Explainers)
-- [Text explainers](../../../features/responsible_ai/Interpretability%20-%20Text%20Explainers)
-
-| | Tabular models | Vector models | Image models | Text models |
-|------------------------|-----------------------------|---------------------------|-------------------------|-----------------------|
-| LIME explainers | [TabularLIME](#tabularlime) | [VectorLIME](#vectorlime) | [ImageLIME](#imagelime) | [TextLIME](#textlime) |
-| Kernel SHAP explainers | [TabularSHAP](#tabularshap) | [VectorSHAP](#vectorshap) | [ImageSHAP](#imageshap) | [TextSHAP](#textshap) |
-
-### Common local explainer params
-
-All local explainers support the following params:
-
-| Param | Type | Default | Description |
-|------------------|---------------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| targetCol | `String` | "probability" | The column name of the prediction target to explain (i.e. the response variable). This is usually set to "prediction" for regression models and "probability" for probabilistic classification models. |
-| targetClasses | `Array[Int]` | empty array | The indices of the classes for multinomial classification models. |
-| targetClassesCol | `String` | | The name of the column that specifies the indices of the classes for multinomial classification models. |
-| outputCol | `String` | | The name of the output column for interpretation results. |
-| model | `Transformer` | | The model to be explained. |
-
-### Common LIME explainer params
-
-All LIME based explainers ([TabularLIME](#tabularlime), [VectorLIME](#vectorlime), [ImageLIME](#imagelime), [TextLIME](#textlime)) support the following params:
-
-| Param | Type | Default | Description |
-|----------------|----------|---------------------------------|-----------------------------------------------------------|
-| regularization | `Double` | 0 | Regularization param for the underlying lasso regression. |
-| kernelWidth | `Double` | sqrt(number of features) * 0.75 | Kernel width for the exponential kernel. |
-| numSamples | `Int` | 1000 | Number of samples to generate. |
-| metricsCol | `String` | "r2" | Column name for fitting metrics. |
-
-### Common SHAP explainer params
-
-All Kernel SHAP based explainers ([TabularSHAP](#tabularshap), [VectorSHAP](#vectorshap), [ImageSHAP](#imageshap), [TextSHAP](#textshap)) support the following params:
-
-| Param | Type | Default | Description |
-|------------|----------|---------------------------------|------------------------------------------------|
-| infWeight | `Double` | 1E8 | The double value to represent infinite weight. |
-| numSamples | `Int` | 2 * (number of features) + 2048 | Number of samples to generate. |
-| metricsCol | `String` | "r2" | Column name for fitting metrics. |
-
-### Tabular model explainer params
-
-All tabular model explainers ([TabularLIME](#tabularlime), [TabularSHAP](#tabularshap)) support the following params:
-
-| Param | Type | Default | Description |
-|----------------|-----------------|---------|--------------------------------------------------------------------------------------------------------------|
-| inputCols | `Array[String]` | | The names of input columns to the black-box model. |
-| backgroundData | `DataFrame` | | A dataframe containing background data. It must contain all the input columns needed by the black-box model. |
-
-### Vector model explainer params
-
-All vector model explainers ([VectorLIME](#vectorlime), [VectorSHAP](#vectorshap)) support the following params:
-
-| Param | Type | Default | Description |
-|----------------|-------------|---------|----------------------------------------------------------------------------------------------------------------|
-| inputCol       | `String`    |         | The name of the input vector column to the black-box model.                                                     |
-| backgroundData | `DataFrame` | | A dataframe containing background data. It must contain the input vector column needed by the black-box model. |
-
-### Image model explainer params
-
-All image model explainers ([ImageLIME](#imagelime), [ImageSHAP](#imageshap)) support the following params:
-
-| Param | Type | Default | Description |
-|---------------|----------|---------------|--------------------------------------------------------------------|
-| inputCol      | `String` |               | The name of the input image column to the black-box model.                 |
-| cellSize      | `Double` | 16            | Number that controls the size of the super-pixels.                         |
-| modifier      | `Double` | 130           | Controls the trade-off between spatial and color distance of super-pixels. |
-| superpixelCol | `String` | "superpixels" | The column holding the super-pixel decompositions. |
-
-### Text model explainer params
-
-All text model explainers ([TextLIME](#textlime), [TextSHAP](#textshap)) support the following params:
-
-| Param | Type | Default | Description |
-|-----------|----------|----------|--------------------------------------------------------|
-| inputCol  | `String` |          | The name of the input text column to the black-box model. |
-| tokensCol | `String` | "tokens" | The column holding the text tokens. |
-
-### `TabularLIME`
-
-| Param | Type | Default | Description |
-|---------------------|-----------------|-------------|----------------------------------------------------------------------|
-| categoricalFeatures | `Array[String]` | empty array | The name of columns that should be treated as categorical variables. |
-
-> For categorical features, `TabularLIME` creates new samples by drawing samples based on the value distribution from the background dataset. For numerical features, it creates new samples by drawing from a normal distribution with mean taken from the target value to be explained, and standard deviation taken from the background dataset.
-
-### `TabularSHAP`
-
-No additional params are supported.
-
-### `VectorLIME`
-
-No additional params are supported.
-
-> `VectorLIME` assumes all features are numerical, and categorical features are not supported in `VectorLIME`.
-
-### `VectorSHAP`
-
-No additional params are supported.
-
-### `ImageLIME`
-
-| Param | Type | Default | Description |
-|------------------|----------|---------|----------------------------------------------------------|
-| samplingFraction | `Double` | 0.7 | The fraction of super-pixels to keep on during sampling. |
-
-> `ImageLIME` creates new samples by randomly turning super-pixels on or off with probability of keeping on set to `SamplingFraction`.
-
-### `ImageSHAP`
-
-No additional params are supported.
-
-### `TextLIME`
-
-| Param | Type | Default | Description |
-|------------------|----------|---------|---------------------------------------------------------|
-| samplingFraction | `Double` | 0.7 | The fraction of word tokens to keep on during sampling. |
-
-> `TextLIME` creates new samples by randomly turning word tokens on or off with probability of keeping on set to `SamplingFraction`.
-
-### `TextSHAP`
-
-No additional params are supported.
-
-## Result interpretation
-
-### LIME explainers
-
-LIME explainers return an array of vectors, and each vector maps to a class being explained. Each component of the vector is the coefficient for the corresponding feature, super-pixel, or word token from the local surrogate model.
-
-- For categorical variables, super-pixels, or word tokens, the coefficient shows the average change in model outcome if this feature is unknown to the model, if the super-pixel is replaced with background color (black), or if the word token is replaced with empty string.
-- For numeric variables, the coefficient shows the change in model outcome if the feature value is incremented by 1 unit.
-
-### SHAP explainers
-
-SHAP explainers return an array of vectors, and each vector maps to a class being explained. Each vector starts with the [base value](#base-value), and each following component of the vector is the Shapley value for each feature, super-pixel, or token.
-
-The base value and Shapley values are additive, and they should add up to the model output for the target observation.
-
-#### Base value
-
-- For tabular and vector models, the base value represents the mean outcome of the model for the background dataset.
-- For image models, the base value represents the model outcome for a background (all black) image.
-- For text models, the base value represents the model outcome for an empty string.
diff --git a/website/docs/features/spark_serving/about.md b/website/docs/features/spark_serving/about.md
deleted file mode 100644
index 1aaeadde49..0000000000
--- a/website/docs/features/spark_serving/about.md
+++ /dev/null
@@ -1,228 +0,0 @@
----
-title: Spark Serving
-hide_title: true
-sidebar_label: About
----
-
-
-
-# Spark Serving
-
-### An Engine for Deploying Spark Jobs as Distributed Web Services
-
-- **Distributed**: Takes full advantage of Node, JVM, and thread level
- parallelism that Spark is famous for.
-- **Fast**: No single node bottlenecks, no round trips to Python.
- Requests can be routed directly to and from worker JVMs through
- network switches. Spin up a web service in a matter of seconds.
-- **Low Latency**: When using continuous serving,
- you can achieve latencies as low as 1 millisecond.
-- **Deployable Anywhere**: Works anywhere that runs Spark such as
- Databricks, HDInsight, AZTK, DSVMs, local, or on your own
- cluster. Usable from Spark, PySpark, and SparklyR.
-- **Lightweight**: No dependence on costly Kafka or
- Kubernetes clusters.
-- **Idiomatic**: Uses the same API as batch and structured streaming.
-- **Flexible**: Spin up and manage several services on a single Spark
-  cluster. Synchronous and asynchronous service management and
-  extensibility. Deploy any Spark job that is expressible as a
- structured streaming query. Use serving sources/sinks with other
- Spark data sources/sinks for more complex deployments.
-
-## Usage
-
-### Jupyter Notebook Examples
-
-- [Deploy a classifier trained on the Adult Census Dataset](../SparkServing%20-%20Deploying%20a%20Classifier)
-- More coming soon!
-
-### Spark Serving Hello World
-
-```python
-import synapse.ml
-import pyspark
-from pyspark.sql.functions import udf, col, length
-from pyspark.sql.types import *
-
-df = spark.readStream.server() \
- .address("localhost", 8888, "my_api") \
- .load() \
- .parseRequest(StructType().add("foo", StringType()).add("bar", IntegerType()))
-
-replies = df.withColumn("fooLength", length(col("foo")))\
- .makeReply("fooLength")
-
-server = replies\
- .writeStream \
- .server() \
- .replyTo("my_api") \
- .queryName("my_query") \
- .option("checkpointLocation", "file:///path/to/checkpoints") \
- .start()
-```
-
-### Deploying a Deep Network with the CNTKModel
-
-```python
-import synapse.ml
-from synapse.ml.cntk import CNTKModel
-import pyspark
-from pyspark.sql.functions import udf, col
-
-df = spark.readStream.server() \
-    .address("localhost", 8888, "my_api") \
-    .load() \
-    .parseRequest()
-
-# See notebook examples for how to create and save several
-# examples of CNTK models
-network = CNTKModel.load("file:///path/to/my_cntkmodel.mml")
-
-transformed_df = network.transform(df).makeReply()
-
-server = transformed_df \
- .writeStream \
- .server() \
- .replyTo("my_api") \
- .queryName("my_query") \
- .option("checkpointLocation", "file:///path/to/checkpoints") \
- .start()
-```
-
-## Architecture
-
-Spark Serving adds special streaming sources and sinks to turn any
-structured streaming job into a web service. Spark Serving comes
-with two deployment options that vary based on what form of load balancing
-is being used.
-
-In brief, you can use:
-
-- `spark.readStream.server()`: for head node load balanced services
-- `spark.readStream.distributedServer()`: for custom load balanced services
-- `spark.readStream.continuousServer()`: for a custom load balanced, submillisecond-latency continuous server
-
-to create the various serving dataframes, and use the equivalent statements after `df.writeStream`
-for replying to the web requests.
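-
-For example, a rough sketch of the distributed variant, mirroring the hello-world service above (the address, port, and request schema are placeholders):
-
-```python
-import synapse.ml
-from pyspark.sql.types import StructType, StringType
-
-df = spark.readStream.distributedServer() \
-    .address("localhost", 9999, "my_api") \
-    .load() \
-    .parseRequest(StructType().add("value", StringType()))
-
-server = df.makeReply("value") \
-    .writeStream \
-    .distributedServer() \
-    .replyTo("my_api") \
-    .queryName("my_distributed_query") \
-    .option("checkpointLocation", "file:///path/to/checkpoints") \
-    .start()
-```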
-
-### Head Node Load Balanced
-
-You can deploy head node load balancing with the `HTTPSource` and
-`HTTPSink` classes. This mode spins up a queue on the head node,
-distributes work across partitions, then collects response data back to
-the head node. All HTTP requests are kept and replied to on the head
-node. In both Python and Scala, these classes can be accessed by using
-`spark.readStream.server()` after importing SynapseML.
-This mode allows for more complex windowing, repartitioning, and
-SQL operations. This option is also ideal for rapid setup and testing,
-as it doesn't require any further load balancing or network
-switches. A diagram of this configuration can be seen in this image:
-
-
-
-
-
-### Fully Distributed (Custom Load Balancer)
-
-You can configure Spark Serving for a custom load balancer using the
-`DistributedHTTPSource` and `DistributedHTTPSink` classes. This mode
-spins up servers on each executor JVM.
-In both Python and Scala, these classes can be accessed by using
-`spark.readStream.distributedServer()` after importing SynapseML.
-Each server will feed its
-executor's partitions in parallel. This mode is key for high throughput
-and low latency as data doesn't need to be transferred to and from the
-head node. This deployment results in several web services that all
-route into the same Spark computation. You can deploy an external load
-balancer to unify the executor's services under a single IP address.
-Support for automatic load balancer management and deployment is
-targeted for the next release of SynapseML. A diagram of this
-configuration can be seen here:
-
-
-
-
-
-Queries that involve data movement across workers, such as a nontrivial
-SQL join, need special consideration. The user must ensure that the
-right machine replies to each request. One can route data back to the
-originating partition with a broadcast join. In the future, request
-routing will be automatically handled by the sink.
-
-### Sub-Millisecond Latency with Continuous Processing
-
-
-
-
-
-Continuous processing can be enabled by hooking into the `HTTPSourceV2` class using:
-
-```python
-spark.readStream.continuousServer()
-    ...
-```
-
-In continuous serving, much like continuous streaming you need to add a trigger to your write statement:
-
-```python
-df.writeStream
-    .continuousServer()
-    .trigger(continuous="1 second")
-    ...
-```
-
-The architecture is similar to the custom load balancer setup described earlier.
-More specifically, Spark will manage a web service on each partition.
-These webservices can be unified together using an Azure Load Balancer,
-Kubernetes Service Endpoint, Azure Application gateway or any other way to load balance a distributed service.
-It's currently the user's responsibility to optionally unify these services as they see fit.
-In the future, we'll include options to dynamically spin up and manage a load balancer.
-
-#### Databricks Setup
-
-Databricks is a managed architecture that restricts
-all incoming traffic to the nodes of the cluster.
-If you create a web service in your Databricks cluster (head or worker nodes),
-your cluster can communicate with the service, but the outside world can't.
-However, in the future, Databricks will support Virtual Network Injection, so this problem won't arise.
-In the meantime, you must use SSH tunneling to forward the services to one or more other machines
-that act as a networking gateway. The gateway can be any machine that accepts SSH traffic and requests.
-We have included settings to automatically configure this SSH tunneling for convenience.
-
-##### Linux Gateway Setup - Azure
-
-1. [Create a Linux VM using SSH](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal)
-2. [Open ports 8000-9999 from the Azure portal](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal)
-3. Open the port on the firewall on the VM
-   ```bash
- firewall-cmd --zone=public --add-port=8000-10000/tcp --permanent
- firewall-cmd --reload
- echo "GatewayPorts yes" >> /etc/ssh/sshd_config
- service ssh --full-restart
- ```
-4. Add your private key to a private container in [Azure Storage Blob](https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&tabs=portal).
-5. Generate a SAS link for your key and save it.
-6. Include the following parameters on your reader to configure the SSH tunneling:
-
-   ```python
-   serving_inputs = (spark.readStream.continuousServer()
-       .option("numPartitions", 1)
-       .option("forwarding.enabled", True)  # enable ssh forwarding to a gateway machine
-       .option("forwarding.username", "username")
-       .option("forwarding.sshHost", "ip or dns")
-       .option("forwarding.keySas", "SAS url from the previous step")
-       .address("localhost", 8904, "my_api")
-       .load())
-   ```
-
-This setup adds an extra network hop to your service and affects latency.
-It's important to pick a gateway that has good connectivity to your Spark cluster.
-For best performance and ease of configuration, we suggest using Spark Serving
-on an open cluster environment such as Kubernetes, Mesos, or Azure Batch.
-
-
-## Parameters
-
-| Parameter Name | Description | Necessary | Default Value | Applicable When |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------------- | ----------------------------------------------------------------------------------------------------- |
-| host | The host to spin up a server on | Yes | | |
-| port | The starting port when creating the web services. Web services will increment this port several times to find an open port. In the future, the flexibility of this param will be expanded | yes | | |
-| name | The Path of the api a user would call. The format is `hostname:port/name` | yes | | |
-| forwarding.enabled | Whether to forward the services to a gateway machine | no | false | When you need to forward services out of a protected network. Only Supported for Continuous Serving. |
-| forwarding.username | the username to connect to on the remote host | no | | |
-| forwarding.sshport | the port to ssh connect to | no | 22 | |
-| forwarding.sshHost | the host of the gateway machine | no | | |
-| forwarding.keySas | A Secure access link that can be used to automatically download the required ssh private key | no | | Sometimes more convenient than a directory |
-| forwarding.keyDir | A directory on the machines holding the private key | no | "~/.ssh" | Useful if you can't send keys over the wire securely |
diff --git a/website/docs/features/vw/about.md b/website/docs/features/vw/about.md
deleted file mode 100644
index ac0f56ff2f..0000000000
--- a/website/docs/features/vw/about.md
+++ /dev/null
@@ -1,112 +0,0 @@
----
-title: VW
-hide_title: true
-sidebar_label: About
----
-
-
-
-# VowpalWabbit on Apache Spark
-
-### Overview
-
-[VowpalWabbit](https://github.com/VowpalWabbit/vowpal_wabbit) (VW) is a machine learning system that
-pushes the frontier of machine learning with techniques such as online, hashing, allreduce,
-reductions, learning2search, active, and interactive learning.
-VowpalWabbit is a popular choice in ad-tech due to its speed and cost efficacy.
-Furthermore it includes many advances in the area of reinforcement learning (for instance, contextual bandits).
-
-### Advantages of VowpalWabbit
-
-- **Composability**: VowpalWabbit models can be incorporated into existing
- SparkML Pipelines, and used for batch, streaming, and serving workloads.
-- **Small footprint**: VowpalWabbit memory consumption is rather small and can be controlled through '-b 18' or the setNumBits method.
- This option determines the size of the model (2^18 * some_constant, in this example).
-- **Feature Interactions**: Feature interactions (quadratic, cubic,... terms, for instance) are created on the fly within the innermost
-  learning loop in VW.
- Interactions can be specified by using the -q parameter and passing the first character of the namespaces that should be _interacted_.
- The VW namespace concept is mapped to Spark using columns. The column name is used as namespace name, thus one sparse or dense Spark ML vector corresponds to the features of a single namespace.
- To allow passing of multiple namespaces, the VW estimator (classifier or regression) exposes a property called _additionalFeatures_. Users can pass an array of column names.
-- **Simple deployment**: all native dependencies are packaged into a single jar (including Boost and zlib).
-- **VowpalWabbit command line arguments**: users can pass VW command line arguments to control the learning process.
-- **VowpalWabbit binary models**: To start the training, users can supply an initial VowpalWabbit model, which can be produced outside of
-  VW on Spark, by invoking _setInitialModel_ and passing the model as a byte array. Similarly, users can access the binary model by invoking
-  _getModel_ on the trained model object.
-- **Java-based hashing**: VW's version of murmur-hash was reimplemented in Java (praise to [JackDoe](https://github.com/jackdoe)),
-  providing a major performance improvement compared to passing input strings through JNI and hashing in C++.
-- **Cross language**: VowpalWabbit on Spark is available on Spark, PySpark, and SparklyR.
-
-### Limitations of VowpalWabbit on Spark
-
-- **Linux and CentOS only**: The native binaries included with the published jar are built for Linux and CentOS only.
-  We're working on creating a more portable version by statically linking Boost and lib C++.
-- **Limited Parsing**: Features implemented in the native VW parser (ngrams, skips, ...) aren't yet implemented in
-  VowpalWabbitFeaturizer.
-
-### Usage
-
-In PySpark, you can run the `VowpalWabbitClassifier` via:
-
-```python
-from synapse.ml.vw import VowpalWabbitClassifier
-model = (VowpalWabbitClassifier(numPasses=5, args="--holdout_off --loss_function logistic")
- .fit(train))
-```
-
-Similarly, you can run the `VowpalWabbitRegressor`:
-
-```python
-from synapse.ml.vw import VowpalWabbitRegressor
-model = (VowpalWabbitRegressor(args="--holdout_off --loss_function quantile -q :: -l 0.1")
- .fit(train))
-```
-
-You can pass command line parameters to VW via the args parameter, as documented in the [VW Wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments).
-
-For an end-to-end application, check out the VowpalWabbit [notebook
-example](../Vowpal%20Wabbit%20-%20Overview).
-
-### Hyper-parameter tuning
-
-- Common parameters can also be set through methods, enabling the use of SparkML's ParamGridBuilder and CrossValidator ([example](https://github.com/Azure/mmlspark/blob/master/src/test/scala/com/microsoft/azure/synapse/ml/vw/VerifyVowpalWabbitClassifier.scala#L29)); see the sketch after this list. If
-  the same parameter is passed both through the _args_ property and a setter (for instance, args="-l 0.2" and setLearningRate(0.5)), the _args_ value will
-  take precedence. Tunable parameters include:
-* learningRate
-* numPasses
-* numBits
-* l1
-* l2
-* powerT
-* interactions
-* ignoreNamespaces
-
-### Architecture
-
-VowpalWabbit on Spark uses an optimized JNI layer to efficiently support Spark.
-Java bindings can be found in the [VW GitHub repo](https://github.com/VowpalWabbit/vowpal_wabbit/blob/master/java/src/main/c%2B%2B/jni_spark_vw_generated.h).
-
-VW's command line tool uses a two-thread architecture (1x parsing/hashing, 1x learning) for learning and inference.
-To embed VW fluently into the Spark ML ecosystem, the following adaptations were made:
-
-- VW classifier/regressor operates on Spark's dense/sparse vectors
- - Pro: best composability with existing Spark ML components.
- - Cons: due to type restrictions (for example, feature indices are Java integers), the maximum model size is limited to 30 bits. One could overcome this restriction by adding type support to the classifier/regressor to directly operate on input features (strings, int, double, ...).
-
-- VW hashing is separated out into the [VowpalWabbitFeaturizer](https://github.com/Azure/mmlspark/blob/master/src/test/scala/com/microsoft/azure/synapse/ml/vw/VerifyVowpalWabbitFeaturizer.scala#L34) transformer. It supports mapping a Spark DataFrame schema into VW's namespaces and sparse
-  features (see the sketch at the end of this section).
- - Pro: featurization can be scaled to many nodes, scale independent of distributed learning.
- - Pro: hashed features can be cached and efficiently reused when performing hyper-parameter sweeps.
- - Pro: featurization can be used for other Spark ML learning algorithms.
- - Cons: due to type restrictions (for instance, sparse indices are Java integers) the hash space is limited to 30 bits.
-
-- VW multi-pass training can be enabled using '--passes 4' argument or setNumPasses method. Cache file is automatically named.
- - Pro: simplified usage.
-  - Pro: certain algorithms (for example, l-bfgs) require a cache file when running in multi-pass mode.
- - Cons: Since the cache file resides in the Java temp directory, a bottleneck may arise, depending on your node's I/O performance and the location of the temp directory.
-- VW distributed training is transparently set up and can be controlled through the input dataframes number of partitions.
-  Similar to LightGBM, all training instances must be running at the same time, so the maximum parallelism is restricted by the
-  number of executors available in the cluster. Under the hood, VW's built-in spanning tree functionality is used to coordinate _allreduce_.
- Required parameters are automatically determined and supplied to VW. The spanning tree coordination process is run on the driver node.
- - Pro: seamless parallelization.
- - Cons: currently barrier execution mode isn't implemented and thus if one node crashes the complete job needs to be manually restarted.
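-
-As referenced above, a minimal sketch of hashing Spark columns into VW namespaces with the featurizer (the column names are placeholders):
-
-```python
-from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier
-
-featurizer = VowpalWabbitFeaturizer(inputCols=["education", "marital-status", "hours-per-week"],
-                                    outputCol="features")
-featurized = featurizer.transform(df)  # assumes `df` contains the input columns
-
-# The hashed features can be cached and reused across parameter sweeps
-vw = VowpalWabbitClassifier(featuresCol="features", labelCol="label")
-# model = vw.fit(featurized)
-```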
diff --git a/website/docs/getting_started/dotnet_example.md b/website/docs/getting_started/dotnet_example.md
deleted file mode 100644
index fd56c5a83f..0000000000
--- a/website/docs/getting_started/dotnet_example.md
+++ /dev/null
@@ -1,126 +0,0 @@
----
-title: .NET Example with LightGBMClassifier
-sidebar_label: .NET example
-description: A simple example about classification with LightGBMClassifier using .NET
----
-
-:::note
-Make sure you have followed the guidance in [.NET installation](../reference/dotnet-setup.md) before jumping into this example.
-:::
-
-## Classification with LightGBMClassifier
-
-Install the NuGet packages by running the following commands:
-```powershell
-dotnet add package Microsoft.Spark --version 2.1.1
-dotnet add package SynapseML.Lightgbm --version 0.11.2
-dotnet add package SynapseML.Core --version 0.11.2
-```
-
-Use the following code in your main program file:
-```csharp
-using System;
-using System.Collections.Generic;
-using Synapse.ML.Lightgbm;
-using Synapse.ML.Featurize;
-using Microsoft.Spark.Sql;
-using Microsoft.Spark.Sql.Types;
-
-namespace SynapseMLApp
-{
- class Program
- {
- static void Main(string[] args)
- {
- // Create Spark session
- SparkSession spark =
- SparkSession
- .Builder()
- .AppName("LightGBMExample")
- .GetOrCreate();
-
- // Load Data
- DataFrame df = spark.Read()
- .Option("inferSchema", true)
- .Parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")
- .Limit(2000);
-
- var featureColumns = new string[] {"age", "workclass", "fnlwgt", "education", "education-num",
- "marital-status", "occupation", "relationship", "race", "sex", "capital-gain",
- "capital-loss", "hours-per-week", "native-country"};
-
- // Transform features
- var featurize = new Featurize()
- .SetOutputCol("features")
- .SetInputCols(featureColumns)
- .SetOneHotEncodeCategoricals(true)
- .SetNumFeatures(14);
-
- var dfTrans = featurize
- .Fit(df)
- .Transform(df)
- .WithColumn("label", Functions.When(Functions.Col("income").Contains("<"), 0.0).Otherwise(1.0));
-
- DataFrame[] dfs = dfTrans.RandomSplit(new double[] {0.75, 0.25}, 123);
- var trainDf = dfs[0];
- var testDf = dfs[1];
-
- // Create LightGBMClassifier
- var lightGBMClassifier = new LightGBMClassifier()
- .SetFeaturesCol("features")
- .SetRawPredictionCol("rawPrediction")
- .SetObjective("binary")
- .SetNumLeaves(30)
- .SetNumIterations(200)
- .SetLabelCol("label")
- .SetLeafPredictionCol("leafPrediction")
- .SetFeaturesShapCol("featuresShap");
-
- // Fit the model
- var lightGBMClassificationModel = lightGBMClassifier.Fit(trainDf);
-
-            // Apply the transformation and display the results
- lightGBMClassificationModel.Transform(testDf).Show(50);
-
- // Stop Spark session
- spark.Stop();
- }
- }
-}
-```
-
-Run `dotnet build` to build the project. Then navigate to the build output directory, and run the following command:
-```powershell
-spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --packages com.microsoft.azure:synapseml_2.12:0.11.2,org.apache.hadoop:hadoop-azure:3.3.1 --master local microsoft-spark-3-2_2.12-2.1.1.jar dotnet SynapseMLApp.dll
-```
-:::note
-Here we added two packages: synapseml_2.12 for SynapseML's Scala source, and hadoop-azure to support reading files from ADLS.
-:::
-
-Expected output:
-```
-+---+---------+------+-------------+-------------+--------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+--------------------+-----+--------------------+--------------------+----------+--------------------+--------------------+
-|age|workclass|fnlwgt| education|education-num|marital-status| occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|native-country|income| features|label| rawPrediction| probability|prediction| leafPrediction| featuresShap|
-+---+---------+------+-------------+-------------+--------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+--------------------+-----+--------------------+--------------------+----------+--------------------+--------------------+
-| 17| ?|634226| 10th| 6| Never-married| ?| Own-child| White| Female| 0| 0| 17.0| United-States| <=50K|(61,[7,9,11,15,20...| 0.0|[9.37122343731523...|[0.99991486808581...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.0560742274706...|
-| 17| Private| 73145| 9th| 5| Never-married| Craft-repair| Own-child| White| Female| 0| 0| 16.0| United-States| <=50K|(61,[7,9,11,15,17...| 0.0|[12.7512760001880...|[0.99999710138899...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1657810433238...|
-| 17| Private|150106| 10th| 6| Never-married| Sales| Own-child| White| Female| 0| 0| 20.0| United-States| <=50K|(61,[5,9,11,15,17...| 0.0|[12.7676985938038...|[0.99999714860282...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1276877355292...|
-| 17| Private|151141| 11th| 7| Never-married| Handlers-cleaners| Own-child| White| Male| 0| 0| 15.0| United-States| <=50K|(61,[8,9,11,15,17...| 0.0|[12.1656242513070...|[0.99999479363924...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1279828578119...|
-| 17| Private|327127| 11th| 7| Never-married| Transport-moving| Own-child| White| Male| 0| 0| 20.0| United-States| <=50K|(61,[1,9,11,15,17...| 0.0|[12.9962776686392...|[0.99999773124636...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1164691543415...|
-| 18| ?|171088| Some-college| 10| Never-married| ?| Own-child| White| Female| 0| 0| 40.0| United-States| <=50K|(61,[7,9,11,15,20...| 0.0|[12.9400428266629...|[0.99999760000817...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1554829578661...|
-| 18| Private|115839| 12th| 8| Never-married| Adm-clerical| Not-in-family| White| Female| 0| 0| 30.0| United-States| <=50K|(61,[0,9,11,15,17...| 0.0|[11.8393032168619...|[0.99999278472630...| 0.0|[0.0,0.0,0.0,0.0,...|[0.44080835709189...|
-| 18| Private|133055| HS-grad| 9| Never-married| Other-service| Own-child| White| Female| 0| 0| 30.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[11.5747235180479...|[0.99999059936124...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1415862541824...|
-| 18| Private|169745| 7th-8th| 4| Never-married| Other-service| Own-child| White| Female| 0| 0| 40.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[11.8316427733613...|[0.99999272924226...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1527378526573...|
-| 18| Private|177648| HS-grad| 9| Never-married| Sales| Own-child| White| Female| 0| 0| 25.0| United-States| <=50K|(61,[5,9,11,15,17...| 0.0|[10.0820248199174...|[0.99995817710510...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1151843103241...|
-| 18| Private|188241| 11th| 7| Never-married| Other-service| Own-child| White| Male| 0| 0| 16.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[10.4049945509280...|[0.99996972005153...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1356854966291...|
-| 18| Private|200603| HS-grad| 9| Never-married| Adm-clerical| Other-relative| White| Female| 0| 0| 30.0| United-States| <=50K|(61,[0,9,11,15,17...| 0.0|[12.1354343020828...|[0.99999463406365...| 0.0|[0.0,0.0,0.0,0.0,...|[0.53241098695335...|
-| 18| Private|210026| 10th| 6| Never-married| Other-service| Other-relative| White| Female| 0| 0| 40.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[12.3692360082180...|[0.99999575275599...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1275208795564...|
-| 18| Private|447882| Some-college| 10| Never-married| Adm-clerical| Not-in-family| White| Female| 0| 0| 20.0| United-States| <=50K|(61,[0,9,11,15,17...| 0.0|[10.2514945786032...|[0.99996469655062...| 0.0|[0.0,0.0,0.0,0.0,...|[0.36497782752201...|
-| 19| ?|242001| Some-college| 10| Never-married| ?| Own-child| White| Female| 0| 0| 40.0| United-States| <=50K|(61,[7,9,11,15,20...| 0.0|[13.9439986622060...|[0.99999912057674...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1265631737386...|
-| 19| Private| 63814| Some-college| 10| Never-married| Adm-clerical| Not-in-family| White| Female| 0| 0| 18.0| United-States| <=50K|(61,[0,9,11,15,17...| 0.0|[10.2057742895673...|[0.99996304506073...| 0.0|[0.0,0.0,0.0,0.0,...|[0.77645146059597...|
-| 19| Private| 83930| HS-grad| 9| Never-married| Other-service| Own-child| White| Female| 0| 0| 20.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[10.4771335467356...|[0.99997182742919...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1625827100973...|
-| 19| Private| 86150| 11th| 7| Never-married| Sales| Own-child| Asian-Pac-Islander| Female| 0| 0| 19.0| Philippines| <=50K|(61,[5,9,14,15,17...| 0.0|[12.0241839747799...|[0.99999400263272...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1532111483051...|
-| 19| Private|189574| HS-grad| 9| Never-married| Other-service| Not-in-family| White| Female| 0| 0| 30.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[9.53742673004733...|[0.99992790305091...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.0988907054317...|
-| 19| Private|219742| Some-college| 10| Never-married| Other-service| Own-child| White| Female| 0| 0| 15.0| United-States| <=50K|(61,[3,9,11,15,17...| 0.0|[12.8625329757574...|[0.99999740658642...| 0.0|[0.0,0.0,0.0,0.0,...|[-0.1922327651359...|
-+---+---------+------+-------------+-------------+--------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+--------------------+-----+--------------------+--------------------+----------+--------------------+--------------------+
-```
diff --git a/website/docs/getting_started/first_example.md b/website/docs/getting_started/first_example.md
deleted file mode 100644
index 8d73dda6bf..0000000000
--- a/website/docs/getting_started/first_example.md
+++ /dev/null
@@ -1,66 +0,0 @@
----
-title: First Example
-description: Build machine learning applications using Microsoft Machine Learning for Apache Spark
----
-
-## Prerequisites
-
-- If you don't have an Azure subscription, [create a free account before you begin](https://azure.microsoft.com/free/).
-- [Azure Synapse Analytics workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace) with an Azure Data Lake Storage Gen2 storage account configured as the default storage. You need to be the _Storage Blob Data Contributor_ of the Data Lake Storage Gen2 file system that you work with.
-- Spark pool in your Azure Synapse Analytics workspace. For details, see [Create a Spark pool in Azure Synapse](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark).
-- Pre-configuration steps described in the tutorial [Configure Cognitive Services in Azure Synapse](https://docs.microsoft.com/en-us/azure/synapse-analytics/machine-learning/tutorial-configure-cognitive-services-synapse).
-
-## Get started
-
-To get started, import synapse.ml and configure your service keys.
-
-```python
-import synapse.ml
-from synapse.ml.cognitive import *
-from notebookutils import mssparkutils
-
-# A general Cognitive Services key for Text Analytics and Computer Vision (or use separate keys that belong to each service)
-cognitive_service_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_SERVICE_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")
-# A Bing Search v7 subscription key
-bingsearch_service_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_BING_SEARCH_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")
-# An Anomaly Detector subscription key
-anomalydetector_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_ANOMALY_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")
-
-
-```
-
-## Text analytics sample
-
-The [Text Analytics](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/) service provides several algorithms for extracting intelligent insights from text. For example, we can find the sentiment of a given input text. The service returns a score between 0.0 and 1.0, where low scores indicate negative sentiment and high scores indicate positive sentiment. This sample uses three simple sentences and returns the sentiment for each.
-
-```python
-from pyspark.sql.functions import col
-
-# Create a dataframe that's tied to its column names
-df_sentences = spark.createDataFrame([
- ("I'm so happy today, it's sunny!", "en-US"),
- ("this is a dog", "en-US"),s
- ("I'm frustrated by this rush hour traffic!", "en-US")
-], ["text", "language"])
-
-# Run the Text Analytics service with options
-sentiment = (TextSentiment()
- .setTextCol("text")
- .setLocation("eastasia") # Set the location of your cognitive service
- .setSubscriptionKey(cognitive_service_key)
- .setOutputCol("sentiment")
- .setErrorCol("error")
- .setLanguageCol("language"))
-
-# Show the results of your text query in a table format
-
-display(sentiment.transform(df_sentences).select("text", col("sentiment")[0].getItem("sentiment").alias("sentiment")))
-```
-
-### Expected results
-
-| text | sentiment |
-| ------------------------------------------ | --------- |
-| I'm frustrated by this rush hour traffic! | negative |
-| this is a dog | neutral |
-| I'm so happy today, it's sunny! | positive |
diff --git a/website/docs/getting_started/first_model.md b/website/docs/getting_started/first_model.md
deleted file mode 100644
index b11797600f..0000000000
--- a/website/docs/getting_started/first_model.md
+++ /dev/null
@@ -1,117 +0,0 @@
----
-title: First Model
-hide_title: true
-description: First Model
----
-
-# Your First Model
-
-In this example, we construct a basic classification model to predict a person's
-income level given demographics data such as education level or marital status.
-We also learn how to use Jupyter notebooks for developing and running the model.
-
-### Prerequisites
-
-- You've installed the SynapseML package, either as a Docker image or on a
-  Spark cluster.
-- You have basic knowledge of the Python language.
-- You have a basic understanding of machine learning concepts: training, testing,
-  classification.
-
-### Working with Jupyter Notebooks
-
-Once you have the SynapseML package installed, open the Jupyter notebooks folder in
-your web browser:
-
-- Local Docker: `http://localhost:8888`
-- Spark cluster: `https:///jupyter`
-
-Create a new notebook by selecting "New" -> "PySpark3". Let's also give the
-notebook a friendlier name, _Adult Census Income Prediction_, by clicking the
-title.
-
-### Importing Packages and Starting the Spark Application
-
-At this point, the notebook isn't running a Spark application yet. In the
-first cell, let's import some needed packages
-
-```python
-import numpy as np
-import pandas as pd
-```
-
-Click the "run cell" button on the toolbar to start the application. After a
-few moments, you should see the message "SparkSession available as 'spark'".
-Now you're ready to start coding and running your application.
-
-### Reading in Data
-
-In a typical Spark application, you'd likely work with huge datasets stored on a
-distributed file system, such as HDFS. However, to keep this tutorial simple
-and quick, we'll copy over a small dataset from a URL. We then read this data
-into memory using the pandas CSV reader, and distribute the data as a Spark
-DataFrame. Finally, we show the first 5 rows of the dataset. Copy the following
-code to the next cell in your notebook, and run the cell.
-
-```python
-dataFile = "AdultCensusIncome.csv"
-import os, urllib
-if not os.path.isfile(dataFile):
- urllib.request.urlretrieve("https://mmlspark.azureedge.net/datasets/" + dataFile, dataFile)
-data = spark.createDataFrame(pd.read_csv(dataFile, dtype={" hours-per-week": np.float64}))
-data.show(5)
-```
-
-### Selecting Features and Splitting Data to Train and Test Sets
-
-Next, select some features to use in our model. You can try out different
-features, but you should include `" income"` as it is the label column the model
-is trying to predict. We then split the data into `train` and `test` sets.
-
-```python
-data = data.select([" education", " marital-status", " hours-per-week", " income"])
-train, test = data.randomSplit([0.75, 0.25], seed=123)
-```
-
-### Training a Model
-
-To train the classifier model, we use the `synapse.ml.TrainClassifier` class. It
-takes in training data and a base SparkML classifier, maps the data into the
-format expected by the base classifier algorithm, and fits a model.
-
-```python
-from synapse.ml.train import TrainClassifier
-from pyspark.ml.classification import LogisticRegression
-model = TrainClassifier(model=LogisticRegression(), labelCol=" income").fit(train)
-```
-
-`TrainClassifier` implicitly handles string-valued columns and
-binarizes the label column.
-
-### Scoring and Evaluating the Model
-
-Finally, let's score the model against the test set, and use
-`synapse.ml.ComputeModelStatistics` class to compute metrics—accuracy, AUC,
-precision, recall—from the scored data.
-
-```python
-from synapse.ml.train import ComputeModelStatistics
-prediction = model.transform(test)
-metrics = ComputeModelStatistics().transform(prediction)
-metrics.select('accuracy').show()
-```
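-
-If you'd like to inspect the other metrics mentioned above (AUC, precision, recall) as well, you can display the full metrics DataFrame instead of a single column. A minimal sketch, reusing the `metrics` DataFrame computed in the previous cell:
-
-```python
-# Show every metric column ComputeModelStatistics produced for this model;
-# the exact set of columns depends on the evaluation type it inferred.
-metrics.show()
-```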
-
-And that's it: you've built your first machine learning model using the SynapseML
-package. For help on SynapseML classes and methods, you can use Python's `help()`
-function, for example:
-
-```python
-help(synapse.ml.train.TrainClassifier)
-```
-
-Next, view our other tutorials to learn how to
-
-- Tune model parameters to find the best model
-- Use SparkML pipelines to build a more complex model
-- Use deep neural networks for image classification
-- Use text analytics for document classification
diff --git a/website/docs/mlflow/autologging.md b/website/docs/mlflow/autologging.md
deleted file mode 100644
index 76149e72fb..0000000000
--- a/website/docs/mlflow/autologging.md
+++ /dev/null
@@ -1,84 +0,0 @@
----
-title: SynapseML Autologging
-description: SynapseML autologging
----
-
-## Automatic Logging
-
-[MLflow automatic logging](https://www.mlflow.org/docs/latest/tracking.html#automatic-logging) allows you to log metrics, parameters, and models without the need for explicit log statements.
-SynapseML supports autologging for every model in the library.
-
-To enable autologging for SynapseML:
-1. Download this customized [log_model_allowlist file](https://mmlspark.blob.core.windows.net/publicwasb/log_model_allowlist.txt) and put it in a location that your code can access.
-For example:
-* In Synapse `wasb://@.blob.core.windows.net/PATH_TO_YOUR/log_model_allowlist.txt`
-* In Databricks `/dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt`.
-2. Set spark configuration `spark.mlflow.pysparkml.autolog.logModelAllowlistFile` to the path of your `log_model_allowlist.txt` file.
-3. Call `mlflow.pyspark.ml.autolog()` before your training code to enable autologging for all supported models.
-
-Note:
-1. If you want to support autologging of PySpark models not present in the log_model_allowlist file, you can add such models to the file.
-2. If you've enabled autologging, don't add an explicit `with mlflow.start_run()` block, as it might cause multiple runs for a single model or one run for multiple models.
-
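-As a minimal sketch, enabling autologging in a notebook looks like the following, assuming the allowlist file has already been uploaded and `spark.mlflow.pysparkml.autolog.logModelAllowlistFile` points to it:
-
-```python
-import mlflow
-
-# Enable autologging for all supported PySpark/SynapseML models; every
-# subsequent .fit() call on an allowlisted estimator is then logged as an
-# MLflow run automatically, with no explicit mlflow.start_run() needed.
-mlflow.pyspark.ml.autolog()
-```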
-
-## Configuration process in Databricks as an example
-
-1. Install the latest MLflow via `%pip install mlflow -U`
-2. Upload your customized `log_model_allowlist.txt` file to DBFS by clicking the File/Upload Data button in the Databricks UI.
-3. Set the cluster Spark configuration following [this documentation](https://docs.microsoft.com/en-us/azure/databricks/clusters/configure#spark-configuration)
-```
-spark.mlflow.pysparkml.autolog.logModelAllowlistFile /dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt
-```
-4. Run the following line before your training code executes.
-```
-mlflow.pyspark.ml.autolog()
-```
-You can customize how autologging works by supplying appropriate [parameters](https://www.mlflow.org/docs/latest/python_api/mlflow.pyspark.ml.html#mlflow.pyspark.ml.autolog).
-
-5. Find your experiment's results in the `Experiments` tab of the MLflow UI.
-
-
-## Example for ConditionalKNNModel
-```python
-from pyspark.ml.linalg import Vectors
-from synapse.ml.nn import *
-
-df = spark.createDataFrame([
- (Vectors.dense(2.0,2.0,2.0), "foo", 1),
- (Vectors.dense(2.0,2.0,4.0), "foo", 3),
- (Vectors.dense(2.0,2.0,6.0), "foo", 4),
- (Vectors.dense(2.0,2.0,8.0), "foo", 3),
- (Vectors.dense(2.0,2.0,10.0), "foo", 1),
- (Vectors.dense(2.0,2.0,12.0), "foo", 2),
- (Vectors.dense(2.0,2.0,14.0), "foo", 0),
- (Vectors.dense(2.0,2.0,16.0), "foo", 1),
- (Vectors.dense(2.0,2.0,18.0), "foo", 3),
- (Vectors.dense(2.0,2.0,20.0), "foo", 0),
- (Vectors.dense(2.0,4.0,2.0), "foo", 2),
- (Vectors.dense(2.0,4.0,4.0), "foo", 4),
- (Vectors.dense(2.0,4.0,6.0), "foo", 2),
- (Vectors.dense(2.0,4.0,8.0), "foo", 2),
- (Vectors.dense(2.0,4.0,10.0), "foo", 4),
- (Vectors.dense(2.0,4.0,12.0), "foo", 3),
- (Vectors.dense(2.0,4.0,14.0), "foo", 2),
- (Vectors.dense(2.0,4.0,16.0), "foo", 1),
- (Vectors.dense(2.0,4.0,18.0), "foo", 4),
- (Vectors.dense(2.0,4.0,20.0), "foo", 4)
-], ["features","values","labels"])
-
-cnn = (ConditionalKNN().setOutputCol("prediction"))
-cnnm = cnn.fit(df)
-
-test_df = spark.createDataFrame([
- (Vectors.dense(2.0,2.0,2.0), "foo", 1, [0, 1]),
- (Vectors.dense(2.0,2.0,4.0), "foo", 4, [0, 1]),
- (Vectors.dense(2.0,2.0,6.0), "foo", 2, [0, 1]),
- (Vectors.dense(2.0,2.0,8.0), "foo", 4, [0, 1]),
- (Vectors.dense(2.0,2.0,10.0), "foo", 4, [0, 1])
-], ["features","values","labels","conditioner"])
-
-display(cnnm.transform(test_df))
-```
-
-This code should log one run with a ConditionalKNNModel artifact and its parameters.
-
diff --git a/website/docs/mlflow/examples.md b/website/docs/mlflow/examples.md
deleted file mode 100644
index f1745b3aeb..0000000000
--- a/website/docs/mlflow/examples.md
+++ /dev/null
@@ -1,134 +0,0 @@
----
-title: Examples
-description: Examples using SynapseML with MLflow
----
-
-## Prerequisites
-
-If you're using Databricks, install MLflow with this command:
-```
-# run this so that MLflow is installed on the workers as well as the driver
-%pip install mlflow
-```
-
-Install SynapseML based on the [installation guidance](../getting_started/installation.md).
-
-## API Reference
-
-* [mlflow.spark.save_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.save_model)
-* [mlflow.spark.log_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.log_model)
-* [mlflow.spark.load_model](https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.load_model)
-* [mlflow.log_metric](https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.log_metric)
-
-## LightGBMClassificationModel
-
-```python
-import mlflow
-from synapse.ml.featurize import Featurize
-from synapse.ml.lightgbm import *
-from synapse.ml.train import ComputeModelStatistics
-
-with mlflow.start_run():
-
- feature_columns = ["Number of times pregnant","Plasma glucose concentration a 2 hours in an oral glucose tolerance test",
- "Diastolic blood pressure (mm Hg)","Triceps skin fold thickness (mm)","2-Hour serum insulin (mu U/ml)",
- "Body mass index (weight in kg/(height in m)^2)","Diabetes pedigree function","Age (years)"]
- df = spark.createDataFrame([
- (0,131,66,40,0,34.3,0.196,22,1),
- (7,194,68,28,0,35.9,0.745,41,1),
- (3,139,54,0,0,25.6,0.402,22,1),
- (6,134,70,23,130,35.4,0.542,29,1),
- (9,124,70,33,402,35.4,0.282,34,0),
- (0,93,100,39,72,43.4,1.021,35,0),
- (4,110,76,20,100,28.4,0.118,27,0),
- (2,127,58,24,275,27.7,1.6,25,0),
- (0,104,64,37,64,33.6,0.51,22,1),
- (2,120,54,0,0,26.8,0.455,27,0),
- (7,178,84,0,0,39.9,0.331,41,1),
- (2,88,58,26,16,28.4,0.766,22,0),
- (1,91,64,24,0,29.2,0.192,21,0),
- (10,101,76,48,180,32.9,0.171,63,0),
- (5,73,60,0,0,26.8,0.268,27,0),
- (3,158,70,30,328,35.5,0.344,35,1),
- (2,105,75,0,0,23.3,0.56,53,0),
- (12,84,72,31,0,29.7,0.297,46,1),
- (9,119,80,35,0,29.0,0.263,29,1),
- (6,93,50,30,64,28.7,0.356,23,0),
- (1,126,60,0,0,30.1,0.349,47,1)
- ], feature_columns+["labels"]).repartition(2)
-
-
- featurize = (Featurize()
- .setOutputCol("features")
- .setInputCols(feature_columns)
- .setOneHotEncodeCategoricals(True)
- .setNumFeatures(4096))
-
- df_trans = featurize.fit(df).transform(df)
-
- lightgbm_classifier = (LightGBMClassifier()
- .setFeaturesCol("features")
- .setRawPredictionCol("rawPrediction")
- .setDefaultListenPort(12402)
- .setNumLeaves(5)
- .setNumIterations(10)
- .setObjective("binary")
- .setLabelCol("labels")
- .setLeafPredictionCol("leafPrediction")
- .setFeaturesShapCol("featuresShap"))
-
- lightgbm_model = lightgbm_classifier.fit(df_trans)
-
- # Use mlflow.spark.save_model to save the model to your path
- mlflow.spark.save_model(lightgbm_model, "lightgbm_model")
- # Use mlflow.spark.log_model to log the model if you have a connected mlflow service
- mlflow.spark.log_model(lightgbm_model, "lightgbm_model")
-
- # Use mlflow.pyfunc.load_model to load model back as PyFuncModel and apply predict
- prediction = mlflow.pyfunc.load_model("lightgbm_model").predict(df_trans.toPandas())
- prediction = list(map(str, prediction))
- mlflow.log_param("prediction", ",".join(prediction))
-
- # Use mlflow.spark.load_model to load model back as PipelineModel and apply transform
- predictions = mlflow.spark.load_model("lightgbm_model").transform(df_trans)
- metrics = ComputeModelStatistics(evaluationMetric="classification", labelCol='labels', scoredLabelsCol='prediction').transform(predictions).collect()
- mlflow.log_metric("accuracy", metrics[0]['accuracy'])
-```
-
-## Cognitive Services
-
-```python
-import mlflow
-from synapse.ml.cognitive import *
-
-with mlflow.start_run():
-
- text_key = "YOUR_COG_SERVICE_SUBSCRIPTION_KEY"
- df = spark.createDataFrame([
- ("I am so happy today, its sunny!", "en-US"),
- ("I am frustrated by this rush hour traffic", "en-US"),
- ("The cognitive services on spark aint bad", "en-US"),
- ], ["text", "language"])
-
- sentiment_model = (TextSentiment()
- .setSubscriptionKey(text_key)
- .setLocation("eastus")
- .setTextCol("text")
- .setOutputCol("prediction")
- .setErrorCol("error")
- .setLanguageCol("language"))
-
- display(sentiment_model.transform(df))
-
- mlflow.spark.save_model(sentiment_model, "sentiment_model")
- mlflow.spark.log_model(sentiment_model, "sentiment_model")
-
- output_df = mlflow.spark.load_model("sentiment_model").transform(df)
- display(output_df)
-
- # In order to call the predict function successfully you need to specify the
- # outputCol name as `prediction`
- prediction = mlflow.pyfunc.load_model("sentiment_model").predict(df.toPandas())
- prediction = list(map(str, prediction))
- mlflow.log_param("prediction", ",".join(prediction))
-```
diff --git a/website/docs/mlflow/installation.md b/website/docs/mlflow/installation.md
deleted file mode 100644
index ac67a23724..0000000000
--- a/website/docs/mlflow/installation.md
+++ /dev/null
@@ -1,66 +0,0 @@
----
-title: MLflow Installation
-description: Install MLflow on different environments
----
-
-## Installation
-
-Install MLflow from PyPI via `pip install mlflow`
-
-MLflow requires `conda` to be on the `PATH` for the projects feature.
-
-Learn more about MLflow on their [GitHub page](https://github.com/mlflow/mlflow).
-
-
-### Install MLflow on Databricks
-
-If you're using Databricks, install MLflow with this command:
-```
-# run this so that MLflow is installed on the workers as well as the driver
-%pip install mlflow
-```
-
-### Install MLflow on Synapse
-To log models with MLflow, you need to create an Azure Machine Learning workspace and link it to your Synapse workspace.
-
-#### Create Azure Machine Learning Workspace
-
-Follow this document to create an [AML workspace](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources#create-the-workspace). You don't need to create a compute instance or compute clusters.
-
-#### Create an Azure ML Linked Service
-
-
-
-- In the Synapse workspace, go to **Manage** -> **External connections** -> **Linked services**, select **+ New**
-- Select the workspace you want to log the model in and create the linked service. You need the **name of the linked service** to set up the connection.
-
-#### Auth Synapse Workspace
-
-
-- Go to the **Azure Machine Learning workspace** resource -> **access control (IAM)** -> **Role assignment**, select **+ Add**, choose **Add role assignment**
-- Choose **Contributor**, then select **Next**
-- On the **Members** page, choose **Managed identity** and select **+ Select members**. Under **Managed identity**, choose **Synapse workspace**. Under **Select**, choose the workspace you run your experiment on. Click **Select**, then **Review + assign**.
-
-
-#### Use MLflow in Synapse
-Set up the connection:
-```python
-
-#AML workspace authentication using linked service
-from notebookutils.mssparkutils import azureML
-linked_service_name = "YourLinkedServiceName"
-ws = azureML.getWorkspace(linked_service_name)
-mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
-
-#Set MLflow experiment.
-experiment_name = "synapse-mlflow-experiment"
-mlflow.set_experiment(experiment_name)
-```
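-
-Once the tracking URI and experiment are set, runs logged from the notebook land in the linked AML workspace. A minimal sketch using standard MLflow tracking calls, with placeholder parameter and metric names:
-
-```python
-import mlflow
-
-with mlflow.start_run():
-    # Log an illustrative parameter and metric; replace these with your own values.
-    mlflow.log_param("example_param", 1)
-    mlflow.log_metric("example_metric", 0.95)
-```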
-
-#### Alternative (no linked service needed)
-Once you create an AML workspace, you can obtain the MLflow tracking URI directly from the AML start page.
-
-You can set the tracking URI with:
-```python
-mlflow.set_tracking_uri("your mlflow tracking url")
-```
diff --git a/website/docs/mlflow/introduction.md b/website/docs/mlflow/introduction.md
deleted file mode 100644
index 8ed1077fbd..0000000000
--- a/website/docs/mlflow/introduction.md
+++ /dev/null
@@ -1,13 +0,0 @@
----
-title: Introduction
-description: MLflow support of SynapseML
----
-
-## What is MLflow
-
-[MLflow](https://github.com/mlflow/mlflow) is a platform to streamline machine learning development, including tracking experiments, packaging code into reproducible runs, and sharing and deploying models. MLflow offers a set of lightweight APIs that can be used with any existing machine learning application or library, for instance TensorFlow, PyTorch, XGBoost, etc. It runs wherever you currently run ML code, for example, in notebooks, standalone applications or the cloud. MLflow's current components are:
-
-* [MLflow Tracking](https://mlflow.org/docs/latest/tracking.html): An API to log parameters, code, and results in machine learning experiments and compare them using an interactive UI.
-* [MLflow Projects](https://mlflow.org/docs/latest/projects.html): A code packaging format for reproducible runs using Conda and Docker, so you can share your ML code with others.
-* [MLflow Models](https://mlflow.org/docs/latest/models.html): A model packaging format and tools that let you easily deploy the same model from any ML library for both batch and real-time scoring. It supports platforms such as Docker, Apache Spark, Azure ML and AWS SageMaker.
-* [MLflow Model Registry](https://mlflow.org/docs/latest/model-registry.html): A centralized model store, set of APIs, and UI, to collaboratively manage the full lifecycle of MLflow Models.
diff --git a/website/docs/reference/R-setup.md b/website/docs/reference/R-setup.md
deleted file mode 100644
index 1cb70d19dd..0000000000
--- a/website/docs/reference/R-setup.md
+++ /dev/null
@@ -1,150 +0,0 @@
----
-title: R setup
-hide_title: true
-sidebar_label: R setup
-description: R setup and example for SynapseML
----
-
-
-# R setup and example for SynapseML
-
-## Installation
-
-**Requirements**: Ensure that R and
-[devtools](https://github.com/hadley/devtools) are installed on your
-machine.
-
-Also make sure you have Apache Spark installed. If you're using sparklyr, you can use [spark_install](https://spark.rstudio.com/packages/sparklyr/latest/reference/spark_install.html). Be sure to specify the correct version; as of this writing, that's `version="3.2"`. `spark_install` is a bit eccentric and may install a slightly different version, so verify that the version you get is the one you want.
-
-On Windows, download [WinUtils.exe](https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe) and copy it into the `bin` directory of your Spark installation, for example `C:\Users\user\AppData\Local\Spark\spark-3.3.2-bin-hadoop3.2\bin`.
-
-To install the current SynapseML package for R, first install synapseml-core:
-
-```R
-...
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-core-0.11.0.zip")
-...
-```
-
-and then install any or all of the following packages, depending on your intended usage:
-
-synapseml-cognitive,
-synapseml-deep-learning,
-synapseml-lightgbm,
-synapseml-opencv,
-synapseml-vw
-
-In other words:
-
-```R
-...
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-cognitive-0.11.0.zip")
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-deep-learning-0.11.0.zip")
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-lightgbm-0.11.0.zip")
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-opencv-0.11.0.zip")
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-vw-0.11.0.zip")
-...
-```
-
-### Importing libraries and setting up spark context
-
-Installing all dependencies may be time-consuming. When complete, run:
-
-```R
-...
-library(sparklyr)
-library(dplyr)
-config <- spark_config()
-config$sparklyr.defaultPackages <- "com.microsoft.azure:synapseml_2.12:0.11.2"
-sc <- spark_connect(master = "local", config = config)
-...
-```
-
-This creates a spark context on your local machine.
-
-We then need to import the R wrappers:
-
-```R
-...
- library(synapseml.core)
- library(synapseml.cognitive)
- library(synapseml.deep.learning)
- library(synapseml.lightgbm)
- library(synapseml.opencv)
- library(synapseml.vw)
-...
-```
-
-## Example
-
-We can use the faithful dataset in R:
-
-```R
-...
-faithful_df <- copy_to(sc, faithful)
-cmd_model = ml_clean_missing_data(
- x=faithful_df,
- inputCols = c("eruptions", "waiting"),
- outputCols = c("eruptions_output", "waiting_output"),
- only.model=TRUE)
-sdf_transform(cmd_model, faithful_df)
-...
-```
-
-You should see the output:
-
-```R
-...
-# Source: table [?? x 4]
-# Database: spark_connection
- eruptions waiting eruptions_output waiting_output
-
- 1 3.600 79 3.600 79
- 2 1.800 54 1.800 54
- 3 3.333 74 3.333 74
- 4 2.283 62 2.283 62
- 5 4.533 85 4.533 85
- 6 2.883 55 2.883 55
- 7 4.700 88 4.700 88
- 8 3.600 85 3.600 85
- 9 1.950 51 1.950 51
- 10 4.350 85 4.350 85
- # ... with more rows
-...
-```
-
-## Azure Databricks
-
-In Azure Databricks, you can install devtools and the SynapseML package from the URL,
-and then use `spark_connect` with `method = "databricks"`:
-
-```R
-install.packages("devtools")
-devtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.2.zip")
-library(sparklyr)
-library(dplyr)
-sc <- spark_connect(method = "databricks")
-faithful_df <- copy_to(sc, faithful)
-unfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)
-ml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)
-```
-
-## Building from Source
-
-Our R bindings are built as part of the [normal build
-process](developer-readme.md). To get a quick build, start at the root
-of the synapseml directory, and find the generated files. For instance,
-to find the R files for deep-learning, run
-
-```bash
-sbt packageR
-ls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R
-```
-
-You can then run R in a terminal and install the above files directly:
-
-```R
-...
-devtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")
-...
-```
diff --git a/website/docs/reference/contributing_guide.md b/website/docs/reference/contributing_guide.md
deleted file mode 100644
index 341edbd548..0000000000
--- a/website/docs/reference/contributing_guide.md
+++ /dev/null
@@ -1,89 +0,0 @@
----
-title: Contributing Guide
-hide_title: true
-sidebar_label: Contributing Guide
-description: Contributing Guide
----
-
-## Interested in contributing to SynapseML? We're excited to work with you.
-
-### You can contribute in many ways:
-
-- Use the library and give feedback: report bugs, request features.
-- Add sample Jupyter notebooks, Python or Scala code examples, documentation
- pages.
-- Fix bugs and issues.
-- Add new features, such as data transformations or machine learning algorithms.
-- Review pull requests from other contributors.
-
-### How to contribute?
-
-You can give feedback, report bugs and request new features anytime by opening
-an issue. Also, you can up-vote or comment on existing issues.
-
-If you want to add code, examples or documentation to the repository, follow
-this process:
-
-#### Propose a contribution
-
-- Preferably, get started by tackling existing issues to get yourself acquainted
- with the library source and the process.
-- To ensure your contribution is a good fit and doesn't duplicate
-  ongoing work, open an issue or comment on an existing issue. In it, discuss
- your contribution and design.
-- Any algorithm you're planning to contribute should be well known and accepted
- for production use, and backed by research papers.
-- Algorithms should be highly scalable and suitable for massive datasets.
-- All contributions need to comply with the MIT License. Contributors external
-  to Microsoft need to sign a CLA.
-
-#### Implement your contribution
-
-- Fork the SynapseML repository.
-- Implement your algorithm in Scala, using our wrapper generation mechanism to
- produce PySpark bindings.
-- Use SparkML `PipelineStage`s so your algorithm can be used as part of a
-  pipeline.
-- For parameters, use `MMLParam`s.
-- Implement model saving and loading by extending SparkML `MLReadable`.
-- Use good Scala style.
-- Binary dependencies should be on Maven Central.
-- See this [pull request](https://github.com/Microsoft/SynapseML/pull/22) for an
- example contribution.
-
-#### Implement tests
-
-- Set up the build environment. Use a Linux machine or VM (we use Ubuntu, but other
- distros should work too).
-- Test your code locally.
-- Add tests using ScalaTest. Unit tests are required.
-- A sample notebook is required as an end-to-end test.
-
-#### Implement documentation
-
-- Add a [sample Jupyter notebook](https://github.com/microsoft/SynapseML/tree/master/notebooks) that shows the intended use
-  case of your algorithm, with step-by-step instructions. (The same
- notebook could be used for testing the code.)
-- Add in-line ScalaDoc comments to your source code, to generate the [API
- reference documentation](https://mmlspark.azureedge.net/docs/pyspark/)
-
-#### Open a pull request
-
-- In most cases, you should squash your commits into one.
-- Open a pull request, and link it to the discussion issue you created earlier.
-- A SynapseML core team member will trigger a build to test your changes.
-- Fix any build failures. (The pull request will have comments from the build
- with useful links.)
-- Wait for code reviews from core team members and others.
-- Fix issues found in code review and reiterate.
-
-#### Build and check-in
-
-- Wait for a core team member to merge your code in.
-- Your feature will be available through a Docker image and script installation
- in the next release, which typically happens around once a month. You can try
- out your features sooner by using build artifacts for the version that has
- your changes merged in (such versions end with a `.devN`).
-
-If in doubt about how to do something, see how it was done in existing code or
-pull requests, and don't hesitate to ask.
diff --git a/website/docs/reference/dotnet-setup.md b/website/docs/reference/dotnet-setup.md
deleted file mode 100644
index e839b7548e..0000000000
--- a/website/docs/reference/dotnet-setup.md
+++ /dev/null
@@ -1,247 +0,0 @@
----
-title: .NET setup
-hide_title: true
-sidebar_label: .NET setup
-description: .NET setup and example for SynapseML
----
-
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-
-# .NET setup and example for SynapseML
-
-## Installation
-
-### 1. Install .NET
-
-To start building .NET apps, you need to download and install the .NET SDK (Software Development Kit).
-
-Download and install the [.NET Core SDK](https://dotnet.microsoft.com/en-us/download/dotnet/3.1).
-Installing the SDK adds the dotnet toolchain to your PATH.
-
-Once you've installed the .NET Core SDK, open a new command prompt or terminal. Then run `dotnet`.
-
-If the command runs and prints information about how to use dotnet, you can move to the next step.
-If you receive a `'dotnet' is not recognized as an internal or external command` error, make sure
-you opened a new command prompt or terminal before running the command.
-
-### 2. Install Java
-
-Install [Java 8](https://www.oracle.com/java/technologies/downloads/#java8) for Windows and macOS,
-or [OpenJDK 8](https://openjdk.org/install/) for Ubuntu.
-
-Select the appropriate version for your operating system. For example, select jdk-8u201-windows-x64.exe
-for a Windows x64 machine or jdk-8u231-macosx-x64.dmg for macOS. Then, use the `java` command to verify the installation.
-
-### 3. Install Apache Spark
-
-[Download and install Apache Spark](https://spark.apache.org/downloads.html) with version >= 3.2.0.
-(SynapseML v0.11.2 only supports Spark versions >= 3.2.0.)
-
-Extract the downloaded archive (with the 7-Zip app on Windows or `tar` on Linux) and remember the location of the
-extracted files; we use `~/bin/spark-3.2.0-bin-hadoop3.2/` as an example here.
-
-Run the following commands to set the environment variables used to locate Apache Spark.
-On Windows, make sure to run the command prompt in administrator mode.
-
-
-**Windows**:
-
-    setx /M HADOOP_HOME C:\bin\spark-3.2.0-bin-hadoop3.2\
- setx /M SPARK_HOME C:\bin\spark-3.2.0-bin-hadoop3.2\
- setx /M PATH "%PATH%;%HADOOP_HOME%;%SPARK_HOME%bin" # Warning: Don't run this if your path is already long as it will truncate your path to 1024 characters and potentially remove entries!
-
-
-
-**Linux / macOS**:
-
-    export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/
- export PATH="$SPARK_HOME/bin:$PATH"
- source ~/.bashrc
-
-
-
-
-Once you've installed everything and set your environment variables, open a **new** command prompt or terminal and run the following command:
-```bash
-spark-submit --version
-```
-If the command runs and prints version information, you can move to the next step.
-
-If you receive a `'spark-submit' is not recognized as an internal or external command` error, make sure you opened a **new** command prompt.
-
-### 4. Install .NET for Apache Spark
-
-Download the [Microsoft.Spark.Worker](https://github.com/dotnet/spark/releases) **v2.1.1** release from the .NET for Apache Spark GitHub.
-For example if you're on a Windows machine and plan to use .NET Core, download the Windows x64 netcoreapp3.1 release.
-
-Extract Microsoft.Spark.Worker and remember the location.
-
-### 5. Install WinUtils (Windows Only)
-
-.NET for Apache Spark requires WinUtils to be installed alongside Apache Spark.
-[Download winutils.exe](https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe).
-Then, copy WinUtils into C:\bin\spark-3.2.0-bin-hadoop3.2\bin.
-:::note
-If you're using a different version of Hadoop, select the version of WinUtils that's compatible with your version of Hadoop. You can see the Hadoop version at the end of your Spark install folder name.
-:::
-
-### 6. Set DOTNET_WORKER_DIR and check dependencies
-
-Run one of the following commands to set the DOTNET_WORKER_DIR environment variable, which .NET apps use to locate the .NET for Apache Spark
-worker binaries. Make sure to set it to the directory where you downloaded and extracted the Microsoft.Spark.Worker.
-On Windows, make sure to run the command prompt in administrator mode.
-
-
-
-**Windows**:
-
-    setx /M DOTNET_WORKER_DIR
-
-
-
-**Linux / macOS**:
-
-    export DOTNET_WORKER_DIR=
-
-
-
-
-Finally, double-check that you can run `dotnet`, `java`, and `spark-shell` from your command line before you move to the next section.
-
-## Write a .NET for SynapseML App
-
-### 1. Create a console app
-
-In your command prompt or terminal, run the following commands to create a new console application:
-```powershell
-dotnet new console -o SynapseMLApp
-cd SynapseMLApp
-```
-The `dotnet` command creates a new application of type console for you. The `-o` parameter creates a directory
-named `SynapseMLApp` where your app is stored and populates it with the required files.
-The `cd SynapseMLApp` command changes the directory to the app directory you created.
-
-### 2. Install NuGet package
-
-To use .NET for Apache Spark in an app, install the Microsoft.Spark package.
-In your command prompt or terminal, run the following command:
-```powershell
-dotnet add package Microsoft.Spark --version 2.1.1
-```
-:::note
-This tutorial uses Microsoft.Spark version 2.1.1 as SynapseML 0.11.2 depends on it.
-Change to the corresponding version if necessary.
-:::
-
-To use SynapseML features in the app, install SynapseML.X package.
-In this tutorial, we use SynapseML.Cognitive as an example.
-In your command prompt or terminal, run the following command:
-```powershell
-# Update Nuget Config to include SynapseML Feed
-dotnet nuget add source https://mmlspark.blob.core.windows.net/synapsemlnuget/index.json -n SynapseMLFeed
-dotnet add package SynapseML.Cognitive --version 0.11.2
-```
-The `dotnet nuget add source` command adds the SynapseML feed as a package source so that the package can be found.
-
-### 3. Write your app
-Open Program.cs in Visual Studio Code, or any text editor. Replace its contents with this code:
-```csharp
-using System;
-using System.Collections.Generic;
-using Synapse.ML.Cognitive;
-using Microsoft.Spark.Sql;
-using Microsoft.Spark.Sql.Types;
-
-namespace SynapseMLApp
-{
- class Program
- { static void Main(string[] args)
- {
- // Create Spark session
- SparkSession spark =
- SparkSession
- .Builder()
- .AppName("TextSentimentExample")
- .GetOrCreate();
-
- // Create DataFrame
- DataFrame df = spark.CreateDataFrame(
-                new List<GenericRow>
- {
- new GenericRow(new object[] {"I am so happy today, its sunny!", "en-US"}),
- new GenericRow(new object[] {"I am frustrated by this rush hour traffic", "en-US"}),
- new GenericRow(new object[] {"The cognitive services on spark aint bad", "en-US"})
- },
-            new StructType(new List<StructField>
- {
- new StructField("text", new StringType()),
- new StructField("language", new StringType())
- })
- );
-
- // Create TextSentiment
- var model = new TextSentiment()
- .SetSubscriptionKey("YOUR_SUBSCRIPTION_KEY")
- .SetLocation("eastus")
- .SetTextCol("text")
- .SetOutputCol("sentiment")
- .SetErrorCol("error")
- .SetLanguageCol("language");
-
- // Transform
- var outputDF = model.Transform(df);
-
- // Display results
- outputDF.Show();
-
- // Stop Spark session
- spark.Stop();
- }
- }
-}
-```
-[SparkSession](https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.sparksession?view=spark-dotnet) is the entrypoint
-of Apache Spark applications, which manages the context and information of your application. A DataFrame is a way of organizing
-data into a set of named columns.
-
-Create a [TextSentiment](https://mmlspark.blob.core.windows.net/docs/0.11.2/dotnet/classSynapse_1_1ML_1_1Cognitive_1_1TextSentiment.html)
-instance, and set the corresponding subscription key and other configurations. Then, apply the transformation to the DataFrame,
-which analyzes the sentiment of each row and stores the result in the output column.
-
-The result of the transformation is stored in another DataFrame. At this point, no operations have taken place because
-.NET for Apache Spark lazily evaluates the data. The operation defined by the call to model.Transform doesn't execute until the Show method is called to display the contents of the transformed DataFrame to the console. Once you no longer need the Spark
-session, use the Stop method to stop your session.
-
-### 4. Run your .NET App
-Run the following command to build your application:
-```powershell
-dotnet build
-```
-Navigate to your build output directory. For example, on Windows you could run `cd bin\Debug\net5.0`.
-Use the spark-submit command to submit your application to run on Apache Spark.
-```powershell
-spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --packages com.microsoft.azure:synapseml_2.12:0.11.2 --master local microsoft-spark-3-2_2.12-2.1.1.jar dotnet SynapseMLApp.dll
-```
-`--packages com.microsoft.azure:synapseml_2.12:0.11.2` specifies the dependency on synapseml_2.12 version 0.11.2;
-`microsoft-spark-3-2_2.12-2.1.1.jar` specifies Microsoft.Spark version 2.1.1 and Spark version 3.2
-:::note
-This command assumes you have downloaded Apache Spark and added it to your PATH environment variable so that you can use spark-submit.
-Otherwise, you'd have to use the full path (for example, C:\bin\apache-spark\bin\spark-submit or ~/spark/bin/spark-submit).
-:::
-
-When your app runs, the sentiment analysis result is written to the console.
-```
-+-----------------------------------------+--------+-----+--------------------------------------------------+
-| text|language|error| sentiment|
-+-----------------------------------------+--------+-----+--------------------------------------------------+
-| I am so happy today, its sunny!| en-US| null|[{positive, null, {0.99, 0.0, 0.0}, [{I am so h...|
-|I am frustrated by this rush hour traffic| en-US| null|[{negative, null, {0.0, 0.0, 0.99}, [{I am frus...|
-| The cognitive services on spark aint bad| en-US| null|[{negative, null, {0.0, 0.01, 0.99}, [{The cogn...|
-+-----------------------------------------+--------+-----+--------------------------------------------------+
-```
-Congratulations! You successfully authored and ran a .NET for SynapseML app.
-Refer to the [developer docs](https://mmlspark.blob.core.windows.net/docs/0.11.2/dotnet/index.html) for API guidance.
-
-## Next
-
-* Refer to this [tutorial](https://docs.microsoft.com/en-us/dotnet/spark/tutorials/databricks-deployment) for deploying a .NET app to Databricks.
-* You could download compatible [install-worker.sh](https://mmlspark.blob.core.windows.net/publicwasb/dotnet/install-worker.sh)
-and [db-init.sh](https://mmlspark.blob.core.windows.net/publicwasb/dotnet/db-init.sh) files needed for deployment on Databricks.
diff --git a/website/docs/reference/vagrant.md b/website/docs/reference/vagrant.md
deleted file mode 100644
index 4d182a4f3f..0000000000
--- a/website/docs/reference/vagrant.md
+++ /dev/null
@@ -1,44 +0,0 @@
----
-title: Vagrant
-hide_title: true
-sidebar_label: Vagrant
----
-
-
-# Using the SynapseML Vagrant Image
-
-## Install Vagrant and Dependencies
-
-You'll need a few dependencies before we get started. These instructions are for using Vagrant on Windows OS.
-
-1. Ensure [Hyper-V](https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/) is enabled or install [VirtualBox](https://www.virtualbox.org/)
-2. Install an X server for Windows; [VcXsrv](https://sourceforge.net/projects/vcxsrv/) is a lightweight option.
-3. Install the Vagrant version for your OS [here](https://www.vagrantup.com/downloads.html)
-
-## Build the Vagrant Image
-
-Start PowerShell as Administrator, go to the `synapseml/tools/vagrant` directory, and run:
-
- vagrant up
-
-_Note: you may need to select a network switch; try the Default Switch option if possible._
-
-## Connect to the Vagrant Image
-
-First, start the X Window server (use 'XLaunch' if you're using VcXsrv).
-
-From the same directory (with PowerShell as Administrator) run:
-
- $env:DISPLAY="localhost:0"
- vagrant ssh -- -Y
-
- # now you can start IntelliJ and interact with the GUI
- > idea
-
-## Stop the Vagrant Image
-
- vagrant halt
-
-## Further reading
-
-This guide covers the bare minimum for running a Vagrant image. For more information, see the [Vagrant Documentation](https://www.vagrantup.com/intro/index.html).
diff --git a/website/docs/third-party-notices.txt b/website/docs/third-party-notices.txt
deleted file mode 100644
index 58540ba262..0000000000
--- a/website/docs/third-party-notices.txt
+++ /dev/null
@@ -1,298 +0,0 @@
-================================================================================
-*** OpenCV
-================================================================================
-
-By downloading, copying, installing or using the software you agree to
-this license. If you do not agree to this license, do not download,
-install, copy or use the software.
-
-
- License Agreement
- For Open Source Computer Vision Library
- (3-clause BSD License)
-
-Copyright (C) 2000-2016, Intel Corporation, all rights reserved.
-Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
-Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
-Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
-Third party copyrights are property of their respective owners.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the names of the copyright holders nor the names of the contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-This software is provided by the copyright holders and contributors "as
-is" and any express or implied warranties, including, but not limited
-to, the implied warranties of merchantability and fitness for a
-particular purpose are disclaimed. In no event shall copyright holders
-or contributors be liable for any direct, indirect, incidental, special,
-exemplary, or consequential damages (including, but not limited to,
-procurement of substitute goods or services; loss of use, data, or
-profits; or business interruption) however caused and on any theory of
-liability, whether in contract, strict liability, or tort (including
-negligence or otherwise) arising in any way out of the use of this
-software, even if advised of the possibility of such damage.
-
-
-
-================================================================================
-*** File with code "taken from" PCL library
-================================================================================
-
-Software License Agreement (BSD License)
-
-Point Cloud Library (PCL) - www.pointclouds.org
-Copyright (c) 2009-2012, Willow Garage, Inc.
-Copyright (c) 2012-, Open Perception, Inc.
-Copyright (c) XXX, respective authors.
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the copyright holder(s) nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-
-================================================================================
-*** KAZE
-================================================================================
-
-Copyright (c) 2012, Pablo Fernández Alcantarilla
-All Rights Reserved
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- * Neither the name of the copyright holders nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-
-================================================================================
-*** libwebp
-================================================================================
-
-Copyright (c) 2010, Google Inc. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of Google nor the names of its contributors may be
- used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Additional IP Rights Grant (Patents)
-------------------------------------
-
-"These implementations" means the copyrightable works that implement the
-WebM codecs distributed by Google as part of the WebM Project.
-
-Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
-royalty-free, irrevocable (except as stated in this section) patent license to
-make, have made, use, offer to sell, sell, import, transfer, and otherwise
-run, modify and propagate the contents of these implementations of WebM, where
-such license applies only to those patent claims, both currently owned by
-Google and acquired in the future, licensable by Google that are necessarily
-infringed by these implementations of WebM. This grant does not include claims
-that would be infringed only as a consequence of further modification of these
-implementations. If you or your agent or exclusive licensee institute or order
-or agree to the institution of patent litigation or any other patent
-enforcement activity against any entity (including a cross-claim or
-counterclaim in a lawsuit) alleging that any of these implementations of WebM
-or any code incorporated within any of these implementations of WebM
-constitute direct or contributory patent infringement, or inducement of
-patent infringement, then any patent rights granted to you under this License
-for these implementations of WebM shall terminate as of the date such
-litigation is filed."
-
-
-
-================================================================================
-*** File with code "based on" a message of Laurent Pinchart on the
-*** video4linux mailing list
-================================================================================
-
-LEGAL ISSUES
-============
-
-In plain English:
-
-1. We don't promise that this software works. (But if you find any
- bugs, please let us know!)
-2. You can use this software for whatever you want. You don't have to
- pay us.
-3. You may not pretend that you wrote this software. If you use it in a
- program, you must acknowledge somewhere in your documentation that
- you've used the IJG code.
-
-In legalese:
-
-The authors make NO WARRANTY or representation, either express or
-implied, with respect to this software, its quality, accuracy,
-merchantability, or fitness for a particular purpose. This software is
-provided "AS IS", and you, its user, assume the entire risk as to its
-quality and accuracy.
-
-This software is copyright (C) 1991-2013, Thomas G. Lane, Guido
-Vollbeding. All Rights Reserved except as specified below.
-
-Permission is hereby granted to use, copy, modify, and distribute this
-software (or portions thereof) for any purpose, without fee, subject to
-these conditions:
-(1) If any part of the source code for this software is distributed,
- then this README file must be included, with this copyright and
- no-warranty notice unaltered; and any additions, deletions, or
- changes to the original files must be clearly indicated in
- accompanying documentation.
-(2) If only executable code is distributed, then the accompanying
- documentation must state that "this software is based in part on the
- work of the Independent JPEG Group".
-(3) Permission for use of this software is granted only if the user
- accepts full responsibility for any undesirable consequences; the
- authors accept NO LIABILITY for damages of any kind.
-
-These conditions apply to any software derived from or based on the IJG
-code, not just to the unmodified library. If you use our work, you
-ought to acknowledge us.
-
-Permission is NOT granted for the use of any IJG author's name or
-company name in advertising or publicity relating to this software or
-products derived from it. This software may be referred to only as "the
-Independent JPEG Group's software".
-
-We specifically permit and encourage the use of this software as the
-basis of commercial products, provided that all warranty or liability
-claims are assumed by the product vendor.
-
-The Unix configuration script "configure" was produced with GNU
-Autoconf. It is copyright by the Free Software Foundation but is freely
-distributable. The same holds for its supporting scripts (config.guess,
-config.sub, ltmain.sh). Another support script, install-sh, is
-copyright by X Consortium but is also freely distributable.
-
-The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent, GIF reading support
-has been removed altogether, and the GIF writer has been simplified to
-produce "uncompressed GIFs". This technique does not use the LZW
-algorithm; the resulting GIF files are larger than usual, but are
-readable by all standard GIF decoders.
-
-We are required to state that
- "The Graphics Interchange Format(c) is the Copyright property of
- CompuServe Incorporated. GIF(sm) is a Service Mark property of
- CompuServe Incorporated."
-
-
-
-================================================================================
-*** File with code copyright Yossi Rubner, as well as code copyright
-*** MD-Mathematische Dienste GmbH
-================================================================================
-
- Copyright (c) 2002,
- MD-Mathematische Dienste GmbH
- Im Defdahl 5-10
- 44141 Dortmund
- Germany
- www.md-it.de
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-Redistributions of source code must retain the above copyright notice,
-this list of conditions and the following disclaimer. Redistributions
-in binary form must reproduce the above copyright notice, this list of
-conditions and the following disclaimer in the documentation and/or
-other materials provided with the distribution. The name of Contributor
-may not be used to endorse or promote products derived from this
-software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/website/doctest.py b/website/doctest.py
index 7ddfb615a9..6ab47b688d 100644
--- a/website/doctest.py
+++ b/website/doctest.py
@@ -58,7 +58,7 @@ def iterate_over_documentation(folder, version):
def main(version):
cur_path = os.getcwd()
- folder = os.path.join(cur_path, "website", "docs", "documentation")
+ folder = os.path.join(cur_path, "docs", "Quick Examples")
iterate_over_documentation(folder, version)
os.chdir(folder)
os.system(
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index 2d6a3e3e2a..8e92b04f5a 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -1,7 +1,6 @@
const math = require('remark-math')
const katex = require('rehype-katex')
const path = require('path');
-const { all_examples } = require('./src/plugins/examples');
let version = "0.11.2";
module.exports = {
@@ -14,7 +13,6 @@ module.exports = {
projectName: 'SynapseML',
trailingSlash: true,
customFields: {
- examples: all_examples(),
version: "0.11.2",
},
stylesheets: [
@@ -41,7 +39,7 @@ module.exports = {
src: 'img/logo.svg',
},
items: [
- { to: 'docs/about', label: 'Docs', position: 'left' },
+ { to: 'docs/Overview', label: 'Docs', position: 'left' },
{ to: 'blog', label: 'Blog', position: 'left' },
{ to: 'videos', label: 'Videos', position: 'left' },
{
@@ -86,11 +84,11 @@ module.exports = {
items: [
{
label: 'Installation',
- to: 'docs/getting_started/installation',
+ to: 'docs/Get%20Started/install%20SynapseML',
},
{
label: 'Getting Started',
- to: 'docs/getting_started/first_example',
+ to: 'docs/Get%20Started/Quickstart%20-%20Your%20First%20Models',
},
{
label: 'Python API Reference',
diff --git a/website/notebookconvert.py b/website/notebookconvert.py
deleted file mode 100644
index 85afd69870..0000000000
--- a/website/notebookconvert.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import io
-import os
-import re
-
-
-def add_header_to_markdown(folder, md):
- name = md[:-3]
- with io.open(os.path.join(folder, md), "r+", encoding="utf-8") as f:
- content = f.read()
- f.truncate(0)
- content = re.sub(r"style=\"[\S ]*?\"", "", content)
- content = re.sub(r"