UtrechtUniversity · jelletreep · May 16, 2024 · May 16, 2024
diff --git a/playbooks/roles/whisper/files/whisper_template.ipynb b/playbooks/roles/whisper/files/whisper_template.ipynb
@@ -185,9 +185,7 @@
     "\n",
     "Step 1: create a token [here](https://huggingface.co/settings/tokens)\n",
     "\n",
-    "Step 2: Enter your token in the code cell below and run it. If you haven't accepted the terms earlier, you will get an error message with a link to accept the terms for the relevant model (Segmentation , Voice Activity Detection (VAD), and Speaker Diarization), follow the link and accept the terms and rerun the code cell below. You should now get another error with a new link. Repeat the process until you have accepted the terms for all models actually get output instead of an error message.  \n",
-    "\n",
-    "Use step 7 to save the diarized transcript in all file formats (make sure to change the na)."
+    "Step 2: Enter your token in the code cell below and run it. If you haven't accepted the terms earlier, you will get an error message with a link to accept the terms for the relevant model (Segmentation, Voice Activity Detection (VAD), or Speaker Diarization). Follow the link, accept the terms, and rerun the code cell below. You should now get another error with a new link. Repeat the process until you have accepted the terms for all models and get actual output instead of an error message.  \n"
    ]
   },
   {
@@ -210,6 +208,48 @@
     "print(diarized_transcript[\"segments\"]) # segments are now assigned speaker IDs"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f75b1f38-9662-43d3-8f27-aa8310d60d80",
+   "metadata": {},
+   "source": [
+    "## Save diarized transcripts\n",
+    "\n",
+    "Run the cell below to save the diarized transcript in `.json` and `.txt` format. The `.txt` file holds one transcript segment per line, with fields separated by semicolons."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a46eaa00-b24c-4506-9f5e-5da0dff1b48d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Write the diarized transcript to the output folder as JSON, then export its segments as text.\n",
+    "output_directory = os.path.join(project_folder, \"output\", \"diarized_transcripts\")\n",
+    "os.makedirs(output_directory, exist_ok=True)\n",
+    "\n",
+    "writer = whisper.utils.get_writer(\"json\", output_directory)\n",
+    "writer(diarized_transcript, audio_file, writer_options)\n",
+    "\n",
+    "# Build file paths with os.path.join; plain string concatenation dropped the separator after the folder name.\n",
+    "file_no_ext = os.path.splitext(os.path.basename(audio_file))[0]\n",
+    "json_path = os.path.join(output_directory, file_no_ext + \".json\")  # assumes the writer names its file after the audio file\n",
+    "with open(json_path) as json_file:\n",
+    "    data = json.load(json_file)\n",
+    "\n",
+    "# One segment per row, no header; default write mode so re-running the cell does not append duplicate rows.\n",
+    "df = pd.DataFrame.from_dict(data[\"segments\"], orient=\"columns\")\n",
+    "df.to_csv(os.path.join(output_directory, file_no_ext + \".txt\"), header=False, index=False, sep=\";\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "917876a6",