From f74745c985af8fb5e33803ae9055ff74d3144c16 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Tue, 8 Oct 2024 07:06:57 +0900 Subject: [PATCH] add ai speech sample --- .env.template | 1 + .gitignore | 2 + apps/14_streamlit_azure_ai_speech/README.md | 26 +++ apps/14_streamlit_azure_ai_speech/main.py | 210 ++++++++++++++++++ .../speech_to_text.py | 157 +++++++++++++ poetry.lock | 27 ++- pyproject.toml | 1 + 7 files changed, 418 insertions(+), 6 deletions(-) create mode 100644 apps/14_streamlit_azure_ai_speech/README.md create mode 100644 apps/14_streamlit_azure_ai_speech/main.py create mode 100644 apps/14_streamlit_azure_ai_speech/speech_to_text.py diff --git a/.env.template b/.env.template index 7db54a1..42e6011 100644 --- a/.env.template +++ b/.env.template @@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio" # Azure AI Speech AZURE_AI_SPEECH_API_ENDPOINT="https://.cognitiveservices.azure.com/" AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="" +AZURE_AI_SPEECH_API_REGION="eastus" # Bing search resource BING_SUBSCRIPTION_KEY="" diff --git a/.gitignore b/.gitignore index 5454ece..99e3eac 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,5 @@ generated/ *.jpg *.jpeg .chroma +.stop +.transcribed.txt diff --git a/apps/14_streamlit_azure_ai_speech/README.md b/apps/14_streamlit_azure_ai_speech/README.md new file mode 100644 index 0000000..d865e5f --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/README.md @@ -0,0 +1,26 @@ +# Realtime transcription with Azure AI Speech Service + +This app demonstrates how to use Azure AI Speech Service for realtime transcription. 
+ +## Prerequisites + +- Python 3.10 or later +- Azure AI Speech Service +- Azure OpenAI Service + +## Overview + +```shell +# Speech to Text script +poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help + +# WIP: Streamlit app +poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py +``` + +# References + +- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python) +- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python) +- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python) +- [AzureSpeechService でリアルタイム議事録](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee) diff --git a/apps/14_streamlit_azure_ai_speech/main.py b/apps/14_streamlit_azure_ai_speech/main.py new file mode 100644 index 0000000..6630e60 --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/main.py @@ -0,0 +1,210 @@ +import pathlib +import subprocess +from os import getenv + +import streamlit as st +from dotenv import load_dotenv +from openai import AzureOpenAI + +load_dotenv() + +# Initialize the session state +if "transcribed_result" not in st.session_state: + st.session_state["transcribed_result"] = "" + +with st.sidebar: + inference_type = st.selectbox( + label="INEFERENCE_TYPE", + options=[ + "azure", + "local", + ], + key="INEFERENCE_TYPE", + ) + azure_ai_speech_api_language = st.selectbox( + label="AZURE_AI_SPEECH_API_LANGUAGE", + options=[ + "en-US", + "ja-JP", + ], + key="AZURE_AI_SPEECH_API_LANGUAGE", + ) + if inference_type == "local": + path_to_model = st.text_input( + label="PATH_TO_MODEL", + value="./model", + key="PATH_TO_MODEL", + type="default", + ) + stt_host = st.text_input( + label="STT_HOST", + 
def is_configured() -> bool:
    """Return True when every sidebar field required by the selected
    inference type has been filled in.

    Reads the module-level widget values assigned in the sidebar above.
    Returns an explicit ``False`` (instead of falling through to ``None``)
    for an unknown inference type, so the result can safely drive the
    ``disabled=`` flags on the buttons below.
    """
    if inference_type == "local":
        return bool(path_to_model and stt_host)
    if inference_type == "azure":
        # The azure path needs the OpenAI settings (used by run_task) AND
        # the Speech settings (used by start_recognition) — the original
        # check omitted the Speech key/region, enabling the start button
        # even when transcription could not possibly work.
        return bool(
            azure_openai_api_key
            and azure_openai_endpoint
            and azure_openai_api_version
            and azure_openai_gpt_model
            and azure_ai_speech_api_subscription_key
            and azure_ai_speech_api_region
        )
    return False
def start_recognition():
    """Launch speech_to_text.py as a background transcription process.

    The Popen handle is stored in ``st.session_state["process"]`` so that
    the duplicate-start guard in the start-button handler
    (``st.session_state.get("process")``) can actually see it — the
    original stored it in a module-level ``global process`` that the
    guard never reads, so a second click could spawn a second process.
    """
    script = "apps/14_streamlit_azure_ai_speech/speech_to_text.py"
    if inference_type == "local":
        # Argument list + default shell=False prevents shell injection via
        # the free-text sidebar fields (stt_host, etc.).
        st.session_state["process"] = subprocess.Popen(
            [
                "python",
                script,
                "--output", path_to_transcribed_text,
                "--endpoint", stt_host,
                "--language", azure_ai_speech_api_language,
                "--type", "local",
                "--verbose",
            ]
        )
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        st.session_state["process"] = subprocess.Popen(
            [
                "python",
                script,
                "--output", path_to_transcribed_text,
                "--subscription", azure_ai_speech_api_subscription_key,
                "--region", azure_ai_speech_api_region,
                "--language", azure_ai_speech_api_language,
                "--type", "azure",
                "--verbose",
            ]
        )
client.chat.completions.create( + model=azure_openai_gpt_model, + messages=[ + { + "role": "system", + "content": f""" + Task: {selected_task}. + --- + {input} + --- + """, + }, + ], + ) + return response.choices[0].message.content + raise ValueError(f"Inference type is not supported: {inference_type}") + + +def load_transcribed_text(): + with open(path_to_transcribed_text) as f: + return f.read() + + +if start_transcribe_button: + if not st.session_state.get("process"): + transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})") + start_recognition() + else: + transcription_status.warning("Transcription is already running.") + +if stop_transcribe_button: + pathlib.Path(".stop").touch() + output = load_transcribed_text() + st.session_state.transcribed_result = output + st.rerun() + +if run_task_button: + with st.spinner("Running..."): + output = run_task( + selected_task=selected_task, + input=input, + ) + st.write(output) diff --git a/apps/14_streamlit_azure_ai_speech/speech_to_text.py b/apps/14_streamlit_azure_ai_speech/speech_to_text.py new file mode 100644 index 0000000..2ba4499 --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/speech_to_text.py @@ -0,0 +1,157 @@ +import argparse +import logging +import os +import time + +import azure.cognitiveservices.speech as speechsdk +from dotenv import load_dotenv + +logger = logging.getLogger(__name__) + + +outfilename = "output.txt" + + +def init_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="speech_to_text", + description="Azure AI Speech API Speech-to-Text", + ) + parser.add_argument( + "-t", + "--type", + default="azure", + help="Inference type, either 'local' or 'azure'", + ) + parser.add_argument( + "-e", + "--endpoint", + default="ws://localhost:5000", + help="Host address for local inference", + ) + parser.add_argument( + "-s", + "--subscription", + default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"), + help="Azure AI Speech API subscription key", 
+ ) + parser.add_argument( + "-r", + "--region", + default=os.getenv("AZURE_AI_SPEECH_API_REGION"), + help="Azure AI Speech API region", + ) + parser.add_argument( + "-l", + "--language", + default="en-US", + help="Language code for speech recognition", + ) + parser.add_argument( + "-o", + "--output", + default="output.txt", + help="Output file path", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Set verbose mode", + ) + return parser.parse_args() + + +def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs): + logger.info("Canceled event") + + +def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs): + logger.info("SessionStopped event") + + +def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs): + logger.info("TRANSCRIBED:") + if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: + logger.info(f"\tText={evt.result.text}") + logger.info(f"\tSpeaker ID={evt.result.speaker_id}") + if evt.result.text != "": + with open(outfilename, "a") as f: + f.write(f"{evt.result.text}\n") + elif evt.result.reason == speechsdk.ResultReason.NoMatch: + logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}") + + +def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs): + logger.info("SessionStarted event") + + +def start_transcription(args: argparse.Namespace): + # FIXME: This is a workaround for setting the output file path + global outfilename + outfilename = args.output + + speech_config = None + if args.type == "local": + speech_config = speechsdk.SpeechConfig( + host=args.endpoint, + speech_recognition_language=args.language, + ) + if args.type == "azure": + speech_config = speechsdk.SpeechConfig( + subscription=args.subscription, + region=args.region, + speech_recognition_language=args.language, + ) + if not speech_config: + raise ValueError(f"Invalid inference type: {args.type}") + + 
conversation_transcriber = speechsdk.transcription.ConversationTranscriber( + speech_config=speech_config, + ) + + transcribing_stop = False + + def stop_cb(evt: speechsdk.SessionEventArgs): + # """callback that signals to stop continuous recognition upon receiving an event `evt`""" + logger.info(f"CLOSING on {evt}") + nonlocal transcribing_stop + transcribing_stop = True + + # Connect callbacks to the events fired by the conversation transcriber + conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb) + conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb) + conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb) + conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb) + # stop transcribing on either session stopped or canceled events + conversation_transcriber.session_stopped.connect(stop_cb) + conversation_transcriber.canceled.connect(stop_cb) + + conversation_transcriber.start_transcribing_async() + + # Waits for completion. + while not transcribing_stop: + if os.path.exists(".stop"): + logger.info("Stopping transcription...") + conversation_transcriber.stop_transcribing_async() + os.remove(".stop") + break + time.sleep(0.5) + + conversation_transcriber.stop_transcribing_async() + + +if __name__ == "__main__": + args = init_args() + + # Set verbose mode + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + + # Parse .env file and set environment variables + load_dotenv() + + try: + start_transcription(args=args) + except Exception as err: + logger.info(f"Encountered exception. 
{err}") diff --git a/poetry.lock b/poetry.lock index 7d40bce..ed87c17 100644 --- a/poetry.lock +++ b/poetry.lock @@ -314,6 +314,21 @@ azure-core = ">=1.30.0" isodate = ">=0.6.1" typing-extensions = ">=4.6.0" +[[package]] +name = "azure-cognitiveservices-speech" +version = "1.40.0" +description = "Microsoft Cognitive Services Speech SDK for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:c19e6e95db81e7b2931641ad9c945b8a914541feada9ae131a253ffdf4d731f4"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5f9dcd42cb265ca2e7eb6e4b6a42dc0b2d5e889347cf48e0b254f6911bf88ace"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:9fce3f8b201129adec5d7d17ee004168cff332e74e658efce4fec665eefae54b"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:ad72f0c476ccbbe7a733e76ea4f6eaee2df1455b9e48ba118ba72ddd971c558a"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-win32.whl", hash = "sha256:74e4ead7d685d96df86a31492faddabbeec44b2c9fb16b14c9cc19430a4aa9f8"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-win_amd64.whl", hash = "sha256:8f17752cd89231dd9404b70b4832c17aa1f491326c163f53bfa2c0879c03f14f"}, +] + [[package]] name = "azure-common" version = "1.1.28" @@ -3018,8 +3033,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.3", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3748,9 +3763,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and 
python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3772,9 +3787,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3796,9 +3811,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -4127,8 +4142,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" @@ -6921,4 +6936,4 @@ type = 
["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2e39ea2e1b97d82541a775dc5e9ae7eb8f21a8855f73d176a132b5efeb240e76" +content-hash = "60f93939b4d17185e172caa0b1ce678dba535c16932c7e773893cb018e0b504c" diff --git a/pyproject.toml b/pyproject.toml index c782790..693fa32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ playwright = "^1.47.0" lxml = "^5.3.0" nest-asyncio = "^1.6.0" typer = "^0.12.5" +azure-cognitiveservices-speech = "^1.40.0" [tool.poetry.group.dev.dependencies] pre-commit = "^3.8.0"