docs: Refactor docs and docgen framework (microsoft#2021)
* docs: Refactor docs and doc generation system

* remove old docusaurus versions

* fix broken links

* fix old references

* fix build issues

* small fix

* fix notebook upload

* standardize header

* remove boilerplate

* remove boilerplate

* update boilerplate

* update boilerplate

* fix docgen structure

* fixes
mhamilton723 authored and JessicaXYWang committed Sep 14, 2023
1 parent 9e1da76 commit 19f898c
Showing 1,008 changed files with 834 additions and 170,276 deletions.
2 changes: 1 addition & 1 deletion .acrolinx-config.edn
@@ -1,2 +1,2 @@
{:allowed-branchname-matches ["master" "release-.*"]
:allowed-filename-matches ["notebooks" "website"]}
:allowed-filename-matches ["docs" "website"]}
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -52,7 +52,7 @@ this process:

#### Implement documentation

- Add a [sample Jupyter notebook](notebooks/) that shows the intended use
- Add a [sample Jupyter notebook](docs/) that shows the intended use
case of your algorithm, with instructions in a step-by-step manner. (The same
notebook could be used for testing the code.)
- Add in-line ScalaDoc comments to your source code, to generate the [API
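For contributors unfamiliar with ScalaDoc, the kind of in-line comment the guideline above asks for might look like the following sketch (illustrative only, attached to a made-up helper; it is not part of this commit):

```scala
/** Scores each row of the input column and appends the result.
  * (Illustrative ScalaDoc on a hypothetical helper, not code from this repository.)
  *
  * @param inputCol  name of the column to read from
  * @param outputCol name of the column the result is written to
  * @return the name of the output column, for convenience
  */
def scoreColumn(inputCol: String, outputCol: String): String = {
  // a real transformer would run its model here
  outputCol
}
```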
10 changes: 5 additions & 5 deletions build.sbt
@@ -381,11 +381,11 @@ publishBadges := {
uploadBadge("master version", version.value, "blue", "master_version3.svg")
}

val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload notebooks to blob storage")
val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload docs to blob storage")
uploadNotebooks := {
val localNotebooksFolder = join(baseDirectory.value.toString, "notebooks").toString
val localNotebooksFolder = join(baseDirectory.value.toString, "docs").toString
val blobNotebooksFolder = version.value
uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "notebooks")
uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "docs")
}

val settings = Seq(
@@ -493,8 +493,8 @@ setupTask := {

val convertNotebooks = TaskKey[Unit]("convertNotebooks", "convert notebooks to markdown for website display")
convertNotebooks := {
runCmdStr("python -m docs.python.documentprojection " +
"--customchannels docs/python/synapseml_channels -c website . docs/manifest.yaml -p")
runCmd(Seq("pip", "install", "-e", "."), wd=join(baseDirectory.value, "tools/docgen"))
runCmd(Seq("python", "__main__.py"), wd=join(baseDirectory.value, "tools/docgen/docgen"))
}

val testWebsiteDocs = TaskKey[Unit]("testWebsiteDocs",
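For context, convertNotebooks now delegates to the new docgen package under tools/docgen instead of the old docs.python.documentprojection module. A stand-alone sketch of the same two steps, assuming they are run from the repository root with pip and python on the PATH:

```scala
// Sketch only: mirrors the two runCmd calls above using scala.sys.process.
import scala.sys.process._
import java.io.File

val docgenDir = new File("tools/docgen")
// 1) install the docgen tool as an editable package
Process(Seq("pip", "install", "-e", "."), docgenDir).!
// 2) run its entry point, which regenerates the website markdown from the docs/ notebooks
Process(Seq("python", "__main__.py"), new File(docgenDir, "docgen")).!
```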
@@ -68,6 +68,13 @@ object FileUtilities {
()
}

def copyAndRenameFile(from: File, toDir: File, newName: String, overwrite: Boolean = false): Unit = {
Files.copy(from.toPath, new File(toDir, newName).toPath,
(if (overwrite) Seq(StandardCopyOption.REPLACE_EXISTING)
else Seq()): _*)
()
}

// Perhaps this should move into a more specific place, not a generic file utils thing
def zipFolder(dir: File, out: File): Unit = {
import java.io.{BufferedInputStream, FileInputStream, FileOutputStream}
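The new copyAndRenameFile helper copies a file into a target directory under a different name, optionally overwriting an existing copy. A usage sketch (the file names are invented for illustration):

```scala
// Hypothetical usage of FileUtilities.copyAndRenameFile; paths are made up.
import java.io.File
import com.microsoft.azure.synapse.ml.core.env.FileUtilities

val source = new File("docs/Quickstart - Fine-tune.ipynb")   // assumed source notebook
val targetDir = new File("target/generated-notebooks")       // assumed destination directory
FileUtilities.copyAndRenameFile(source, targetDir, "QuickstartFinetune.ipynb", overwrite = true)
```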
@@ -86,17 +86,17 @@ object DatabricksUtilities {
// Execution Params
val TimeoutInMillis: Int = 40 * 60 * 1000

val NotebookFiles: Array[File] = FileUtilities.recursiveListFiles(
FileUtilities.join(
BuildInfo.baseDirectory.getParent, "notebooks", "features").getCanonicalFile)
val DocsDir = FileUtilities.join(BuildInfo.baseDirectory.getParent, "docs").getCanonicalFile()
val NotebookFiles: Array[File] = FileUtilities.recursiveListFiles(DocsDir)
.filter(_.toString.endsWith(".ipynb"))

val ParallelizableNotebooks: Seq[File] = NotebookFiles.filterNot(_.isDirectory)

val CPUNotebooks: Seq[File] = ParallelizableNotebooks
.filterNot(_.getAbsolutePath.contains("simple_deep_learning"))
.filterNot(_.getAbsolutePath.contains("Fine-tune"))
.filterNot(_.getAbsolutePath.contains("Explanation Dashboard")) // TODO Remove this exclusion

val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("simple_deep_learning"))
val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("Fine-tune"))

def databricksGet(path: String): JsValue = {
val request = new HttpGet(BaseURL + path)
@@ -336,13 +336,15 @@
//scalastyle:on cyclomatic.complexity

def uploadAndSubmitNotebook(clusterId: String, notebookFile: File): DatabricksNotebookRun = {
val destination: String = Folder + "/" + notebookFile.getName
val dirPaths = DocsDir.toURI.relativize(notebookFile.getParentFile.toURI).getPath
val folderToCreate = Folder + "/" + dirPaths
println(s"Creating folder $folderToCreate")
workspaceMkDir(folderToCreate)
val destination: String = folderToCreate + notebookFile.getName
uploadNotebook(notebookFile, destination)
val runId: Int = submitRun(clusterId, destination)
val run: DatabricksNotebookRun = DatabricksNotebookRun(runId, notebookFile.getName)

println(s"Successfully submitted job run id ${run.runId} for notebook ${run.notebookName}")

run
}

@@ -413,9 +415,6 @@ abstract class DatabricksTestHelper extends TestBase {
assert(areLibrariesInstalled(clusterId))
}

println(s"Creating folder $Folder")
workspaceMkDir(Folder)

println(s"Submitting jobs")
val parNotebookRuns: Seq[DatabricksNotebookRun] = notebooks.map(uploadAndSubmitNotebook(clusterId, _))
parNotebookRuns.foreach(notebookRun => jobIdsToCancel.append(notebookRun.runId))
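uploadAndSubmitNotebook now recreates each notebook's docs/ sub-folder inside the Databricks workspace before uploading. A small sketch of the path handling, with an invented notebook path and assuming both directories exist on disk (File.toURI only appends a trailing slash for directories it can see):

```scala
// Sketch of the DocsDir.toURI.relativize(...) step above; values are hypothetical.
import java.io.File

val docsDir = new File("docs").getCanonicalFile
val notebook = new File(docsDir, "Explore Algorithms/Fine-tune.ipynb")
val dirPaths = docsDir.toURI.relativize(notebook.getParentFile.toURI).getPath
// e.g. dirPaths == "Explore Algorithms/", so the workspace folder to create is
// Folder + "/" + dirPaths and the uploaded notebook keeps its sub-folder layout.
```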
@@ -4,34 +4,76 @@
package com.microsoft.azure.synapse.ml.nbtest

import com.microsoft.azure.synapse.ml.build.BuildInfo
import com.microsoft.azure.synapse.ml.core.env.FileUtilities
import com.microsoft.azure.synapse.ml.core.env.{FileUtilities, StreamUtilities}
import org.apache.commons.io.FileUtils

import java.io.File
import java.lang.ProcessBuilder.Redirect
import scala.sys.process._

import scala.io.Source
import java.io.{BufferedWriter, File, FileWriter}

object SharedNotebookE2ETestUtilities {
val ResourcesDirectory = new File(getClass.getResource("/").toURI)
val NotebooksDir = new File(ResourcesDirectory, "generated-notebooks")
val NotebookPreamble: String =
"""
|# In[ ]:
|
|
|# This cell ensures magic commands like '%pip install' work on Synapse scheduled Spark jobs
|from synapse.ml.core.platform import running_on_synapse
|
|if running_on_synapse():
| from IPython import get_ipython
| from IPython.terminal.interactiveshell import TerminalInteractiveShell
| from synapse.ml.core.platform import materializing_display as display
| from pyspark.sql import SparkSession
|
| spark = SparkSession.builder.getOrCreate()
| try:
| shell = TerminalInteractiveShell.instance()
| except:
| pass
|
|""".stripMargin

def insertTextInFile(file: File, textToPrepend: String, locToInsert: Int): Unit = {
val existingLines = StreamUtilities.using(Source.fromFile(file)) { s =>
s.getLines().toList
}.get
val linesBefore = existingLines.take(locToInsert)
val linesAfter = existingLines.takeRight(existingLines.length - locToInsert)
val linesInMiddle = textToPrepend.split("\n")
val newText = (linesBefore ++ linesInMiddle ++ linesAfter).mkString("\n")
StreamUtilities.using(new BufferedWriter(new FileWriter(file))) { writer =>
writer.write(newText)
}
}

def generateNotebooks(): Unit = {
cleanUpGeneratedNotebooksDir()

FileUtilities.recursiveListFiles(FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile)
val docsDir = FileUtilities.join(BuildInfo.baseDirectory.getParent, "docs").getCanonicalFile
val newFiles = FileUtilities.recursiveListFiles(docsDir)
.filter(_.getName.endsWith(".ipynb"))
.map { f =>
FileUtilities.copyFile(f, NotebooksDir, true)
val newFile = new File(NotebooksDir, f.getName)
val targetName = new File(NotebooksDir, f.getName.replace(" ", "").replace("-", ""))
newFile.renameTo(targetName)
targetName
val relative = docsDir.toURI.relativize(f.toURI).getPath
val newName = relative
.replace("/", "")
.replace(" ", "")
.replace("-", "")
.replace(",", "")
FileUtilities.copyAndRenameFile(f, NotebooksDir, newName, true)
new File(NotebooksDir, newName)
}

runCmd(activateCondaEnv ++ Seq("jupyter", "nbconvert", "--to", "python", "*.ipynb"), NotebooksDir)

newFiles.foreach { f =>
insertTextInFile(new File(f.getPath.replace(".ipynb", ".py")), NotebookPreamble, 2)
}

}

def cleanUpGeneratedNotebooksDir(): Unit = {
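generateNotebooks now flattens each docs/ notebook into NotebooksDir, converts it to a Python script with nbconvert, and splices NotebookPreamble in at line index 2. nbconvert's .py output typically begins with a shebang and a coding line, which appears to be why index 2 is used. A hedged stand-alone sketch (the file name is invented):

```scala
// Hypothetical direct use of insertTextInFile after nbconvert has produced the script.
import java.io.File
import com.microsoft.azure.synapse.ml.nbtest.SharedNotebookE2ETestUtilities._

val script = new File(NotebooksDir, "QuickstartFinetune.py") // assumed converted notebook
insertTextInFile(script, NotebookPreamble, 2)                // lands right after the two header lines
```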
@@ -44,11 +44,12 @@ class SynapseTests extends TestBase {

val selectedPythonFiles: Array[File] = FileUtilities.recursiveListFiles(SharedNotebookE2ETestUtilities.NotebooksDir)
.filter(_.getAbsolutePath.endsWith(".py"))
.filterNot(_.getAbsolutePath.contains("DeepLearningDeepTextClassification")) // Excluded by design task 1829306
.filterNot(_.getAbsolutePath.contains("DeepLearningDeepVisionClassification")) // Excluded by design task 1829306
.filterNot(_.getAbsolutePath.contains("VowpalWabbitClassificationusingVWnativeFormat"))
.filterNot(_.getAbsolutePath.contains("Finetune")) // Excluded by design task 1829306
.filterNot(_.getAbsolutePath.contains("VWnativeFormat"))
.filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synpase fix
.filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synpase fix
.filterNot(_.getAbsolutePath.contains("SetupCognitive")) // No code to run
.filterNot(_.getAbsolutePath.contains("CreateaSparkCluster")) // No code to run
.sortBy(_.getAbsolutePath)

val expectedPoolCount: Int = selectedPythonFiles.length
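The exclusion strings above match the flattened names produced by SharedNotebookE2ETestUtilities.generateNotebooks, which strips slashes, spaces, hyphens, and commas from each notebook's docs-relative path. A small illustration with an invented path:

```scala
// Invented docs-relative path, flattened the same way generateNotebooks does.
val relative = "Explore Algorithms/Deep Learning/Quickstart - Fine-tune.ipynb"
val flattened = relative.replace("/", "").replace(" ", "").replace("-", "").replace(",", "")
// flattened == "ExploreAlgorithmsDeepLearningQuickstartFinetune.ipynb", which the
// .filterNot(_.getAbsolutePath.contains("Finetune")) exclusion above would skip.
```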
@@ -33,7 +33,7 @@ sidebar_label: About

### Jupyter Notebook Examples

- [Deploy a classifier trained on the Adult Census Dataset](../SparkServing%20-%20Deploying%20a%20Classifier)
- [Deploy a classifier trained on the Adult Census Dataset](../Quickstart%20-%20Deploying%20a%20Classifier)
- More coming soon!

### Spark Serving Hello World
@@ -9,31 +9,6 @@
"First, we import needed packages:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Bootstrap Spark Session\n",
"spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -33,14 +33,7 @@
{
"cell_type": "code",
"source": [
"import os\n",
"from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import running_on_synapse, find_secret\n",
"\n",
"# Bootstrap Spark Session\n",
"spark = SparkSession.builder.getOrCreate()\n",
"if running_on_synapse():\n",
" from notebookutils.visualization import display\n",
"from synapse.ml.core.platform import find_secret\n",
"\n",
"service_key = find_secret(\"cognitive-api-key\")\n",
"service_loc = \"eastus\""
@@ -264,7 +257,8 @@
{
"cell_type": "markdown",
"source": [
"#### Faster without extra hardware:\n<img src=\"https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/async_relative%20(2).png\" width=\"500\" />"
"#### Faster without extra hardware:\n",
"<img src=\"https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/async_relative%20(2).png\" width=\"500\" />"
],
"metadata": {
"application/vnd.databricks.v1+cell": {
@@ -398,7 +392,9 @@
{
"cell_type": "markdown",
"source": [
"## Learn More\n- [Explore other cogntive services](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20Overview/)\n- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)"
"## Learn More\n",
"- [Explore other cogntive services](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20Overview/)\n",
"- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)"
],
"metadata": {
"application/vnd.databricks.v1+cell": {
@@ -421,8 +417,13 @@
"language": "python",
"widgets": {},
"notebookOrigID": 3743502060540796
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
}
@@ -53,12 +53,8 @@
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import udf, col\n",
"from pyspark.sql.types import StructType, StructField, DoubleType\n",
"from pyspark.sql.functions import lit\n",
"from pyspark.ml import PipelineModel\n",
"from pyspark.sql.functions import col\n",
"import os\n",
"import requests\n",
"from requests.adapters import HTTPAdapter\n",
"from requests.packages.urllib3.util.retry import Retry\n",
@@ -75,21 +71,6 @@
"http.mount(\"http://\", adapter)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import *\n",
"\n",
"# Bootstrap Spark Session\n",
"spark = SparkSession.builder.getOrCreate()\n",
"\n",
"from synapse.ml.core.platform import materializing_display as display"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -98,6 +79,7 @@
"source": [
"from synapse.ml.cognitive import *\n",
"from synapse.ml.geospatial import *\n",
"from synapse.ml.core.platform import *\n",
"\n",
"# An Azure Maps account key\n",
"maps_key = find_secret(\"azuremaps-api-key\")"
@@ -38,20 +38,6 @@
"Let's start by setting up the environment variables for our service keys. The next cell sets the `ANOMALY_API_KEY` and the `BLOB_CONNECTION_STRING` environment variables based on the values stored in our Azure Key Vault. If you're running this tutorial in your own environment, make sure you set these environment variables before you proceed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pyspark.sql import SparkSession\n",
"from synapse.ml.core.platform import find_secret\n",
"\n",
"# Bootstrap Spark Session\n",
"spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -65,6 +51,8 @@
"metadata": {},
"outputs": [],
"source": [
"from synapse.ml.core.platform import find_secret\n",
"\n",
"# An Anomaly Dectector subscription key\n",
"anomalyKey = find_secret(\"anomaly-api-key\") # use your own anomaly api key\n",
"# Your storage account name\n",