From f74745c985af8fb5e33803ae9055ff74d3144c16 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Tue, 8 Oct 2024 07:06:57 +0900 Subject: [PATCH] add ai speech sample --- .env.template | 1 + .gitignore | 2 + apps/14_streamlit_azure_ai_speech/README.md | 26 +++ apps/14_streamlit_azure_ai_speech/main.py | 210 ++++++++++++++++++ .../speech_to_text.py | 157 +++++++++++++ poetry.lock | 27 ++- pyproject.toml | 1 + 7 files changed, 418 insertions(+), 6 deletions(-) create mode 100644 apps/14_streamlit_azure_ai_speech/README.md create mode 100644 apps/14_streamlit_azure_ai_speech/main.py create mode 100644 apps/14_streamlit_azure_ai_speech/speech_to_text.py diff --git a/.env.template b/.env.template index 7db54a1..42e6011 100644 --- a/.env.template +++ b/.env.template @@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio" # Azure AI Speech AZURE_AI_SPEECH_API_ENDPOINT="https://.cognitiveservices.azure.com/" AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="" +AZURE_AI_SPEECH_API_REGION="eastus" # Bing search resource BING_SUBSCRIPTION_KEY="" diff --git a/.gitignore b/.gitignore index 5454ece..99e3eac 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,5 @@ generated/ *.jpg *.jpeg .chroma +.stop +.transcribed.txt diff --git a/apps/14_streamlit_azure_ai_speech/README.md b/apps/14_streamlit_azure_ai_speech/README.md new file mode 100644 index 0000000..d865e5f --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/README.md @@ -0,0 +1,26 @@ +# Realtime transcription with Azure AI Speech Service + +This app demonstrates how to use Azure AI Speech Service for realtime transcription. 
+ +## Prerequisites + +- Python 3.10 or later +- Azure AI Speech Service +- Azure OpenAI Service + +## Overview + +```shell +# Speech to Text script +poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help + +# WIP: Streamlit app +poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py +``` + +# References + +- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python) +- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python) +- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python) +- [AzureSpeechService でリアルタイム議事録](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee) diff --git a/apps/14_streamlit_azure_ai_speech/main.py b/apps/14_streamlit_azure_ai_speech/main.py new file mode 100644 index 0000000..6630e60 --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/main.py @@ -0,0 +1,210 @@ +import pathlib +import subprocess +from os import getenv + +import streamlit as st +from dotenv import load_dotenv +from openai import AzureOpenAI + +load_dotenv() + +# Initialize the session state +if "transcribed_result" not in st.session_state: + st.session_state["transcribed_result"] = "" + +with st.sidebar: + inference_type = st.selectbox( + label="INEFERENCE_TYPE", + options=[ + "azure", + "local", + ], + key="INEFERENCE_TYPE", + ) + azure_ai_speech_api_language = st.selectbox( + label="AZURE_AI_SPEECH_API_LANGUAGE", + options=[ + "en-US", + "ja-JP", + ], + key="AZURE_AI_SPEECH_API_LANGUAGE", + ) + if inference_type == "local": + path_to_model = st.text_input( + label="PATH_TO_MODEL", + value="./model", + key="PATH_TO_MODEL", + type="default", + ) + stt_host = st.text_input( + label="STT_HOST", + 
def is_configured() -> bool:
    """Return True when every sidebar field required by the selected
    inference type has been filled in.

    Reads the module-level widget values assigned in the sidebar above.
    Returns an explicit ``False`` (instead of falling through to ``None``)
    for an unknown inference type, so the result can safely drive the
    ``disabled=`` flags on the buttons below.
    """
    if inference_type == "local":
        return bool(path_to_model and stt_host)
    if inference_type == "azure":
        # The azure path needs the OpenAI settings (used by run_task) AND
        # the Speech settings (used by start_recognition) — the original
        # check omitted the Speech key/region, enabling the start button
        # even when transcription could not possibly work.
        return bool(
            azure_openai_api_key
            and azure_openai_endpoint
            and azure_openai_api_version
            and azure_openai_gpt_model
            and azure_ai_speech_api_subscription_key
            and azure_ai_speech_api_region
        )
    return False
def start_recognition():
    """Launch speech_to_text.py as a background transcription process.

    The Popen handle is stored in ``st.session_state["process"]`` so that
    the duplicate-start guard in the start-button handler
    (``st.session_state.get("process")``) can actually see it — the
    original stored it in a module-level ``global process`` that the
    guard never reads, so a second click could spawn a second process.
    """
    script = "apps/14_streamlit_azure_ai_speech/speech_to_text.py"
    if inference_type == "local":
        # Argument list + default shell=False prevents shell injection via
        # the free-text sidebar fields (stt_host, etc.).
        st.session_state["process"] = subprocess.Popen(
            [
                "python",
                script,
                "--output", path_to_transcribed_text,
                "--endpoint", stt_host,
                "--language", azure_ai_speech_api_language,
                "--type", "local",
                "--verbose",
            ]
        )
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        st.session_state["process"] = subprocess.Popen(
            [
                "python",
                script,
                "--output", path_to_transcribed_text,
                "--subscription", azure_ai_speech_api_subscription_key,
                "--region", azure_ai_speech_api_region,
                "--language", azure_ai_speech_api_language,
                "--type", "azure",
                "--verbose",
            ]
        )
client.chat.completions.create( + model=azure_openai_gpt_model, + messages=[ + { + "role": "system", + "content": f""" + Task: {selected_task}. + --- + {input} + --- + """, + }, + ], + ) + return response.choices[0].message.content + raise ValueError(f"Inference type is not supported: {inference_type}") + + +def load_transcribed_text(): + with open(path_to_transcribed_text) as f: + return f.read() + + +if start_transcribe_button: + if not st.session_state.get("process"): + transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})") + start_recognition() + else: + transcription_status.warning("Transcription is already running.") + +if stop_transcribe_button: + pathlib.Path(".stop").touch() + output = load_transcribed_text() + st.session_state.transcribed_result = output + st.rerun() + +if run_task_button: + with st.spinner("Running..."): + output = run_task( + selected_task=selected_task, + input=input, + ) + st.write(output) diff --git a/apps/14_streamlit_azure_ai_speech/speech_to_text.py b/apps/14_streamlit_azure_ai_speech/speech_to_text.py new file mode 100644 index 0000000..2ba4499 --- /dev/null +++ b/apps/14_streamlit_azure_ai_speech/speech_to_text.py @@ -0,0 +1,157 @@ +import argparse +import logging +import os +import time + +import azure.cognitiveservices.speech as speechsdk +from dotenv import load_dotenv + +logger = logging.getLogger(__name__) + + +outfilename = "output.txt" + + +def init_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="speech_to_text", + description="Azure AI Speech API Speech-to-Text", + ) + parser.add_argument( + "-t", + "--type", + default="azure", + help="Inference type, either 'local' or 'azure'", + ) + parser.add_argument( + "-e", + "--endpoint", + default="ws://localhost:5000", + help="Host address for local inference", + ) + parser.add_argument( + "-s", + "--subscription", + default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"), + help="Azure AI Speech API subscription key", 
+ ) + parser.add_argument( + "-r", + "--region", + default=os.getenv("AZURE_AI_SPEECH_API_REGION"), + help="Azure AI Speech API region", + ) + parser.add_argument( + "-l", + "--language", + default="en-US", + help="Language code for speech recognition", + ) + parser.add_argument( + "-o", + "--output", + default="output.txt", + help="Output file path", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Set verbose mode", + ) + return parser.parse_args() + + +def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs): + logger.info("Canceled event") + + +def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs): + logger.info("SessionStopped event") + + +def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs): + logger.info("TRANSCRIBED:") + if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: + logger.info(f"\tText={evt.result.text}") + logger.info(f"\tSpeaker ID={evt.result.speaker_id}") + if evt.result.text != "": + with open(outfilename, "a") as f: + f.write(f"{evt.result.text}\n") + elif evt.result.reason == speechsdk.ResultReason.NoMatch: + logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}") + + +def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs): + logger.info("SessionStarted event") + + +def start_transcription(args: argparse.Namespace): + # FIXME: This is a workaround for setting the output file path + global outfilename + outfilename = args.output + + speech_config = None + if args.type == "local": + speech_config = speechsdk.SpeechConfig( + host=args.endpoint, + speech_recognition_language=args.language, + ) + if args.type == "azure": + speech_config = speechsdk.SpeechConfig( + subscription=args.subscription, + region=args.region, + speech_recognition_language=args.language, + ) + if not speech_config: + raise ValueError(f"Invalid inference type: {args.type}") + + 
conversation_transcriber = speechsdk.transcription.ConversationTranscriber( + speech_config=speech_config, + ) + + transcribing_stop = False + + def stop_cb(evt: speechsdk.SessionEventArgs): + # """callback that signals to stop continuous recognition upon receiving an event `evt`""" + logger.info(f"CLOSING on {evt}") + nonlocal transcribing_stop + transcribing_stop = True + + # Connect callbacks to the events fired by the conversation transcriber + conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb) + conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb) + conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb) + conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb) + # stop transcribing on either session stopped or canceled events + conversation_transcriber.session_stopped.connect(stop_cb) + conversation_transcriber.canceled.connect(stop_cb) + + conversation_transcriber.start_transcribing_async() + + # Waits for completion. + while not transcribing_stop: + if os.path.exists(".stop"): + logger.info("Stopping transcription...") + conversation_transcriber.stop_transcribing_async() + os.remove(".stop") + break + time.sleep(0.5) + + conversation_transcriber.stop_transcribing_async() + + +if __name__ == "__main__": + args = init_args() + + # Set verbose mode + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + + # Parse .env file and set environment variables + load_dotenv() + + try: + start_transcription(args=args) + except Exception as err: + logger.info(f"Encountered exception. 
{err}") diff --git a/poetry.lock b/poetry.lock index 7d40bce..ed87c17 100644 --- a/poetry.lock +++ b/poetry.lock @@ -314,6 +314,21 @@ azure-core = ">=1.30.0" isodate = ">=0.6.1" typing-extensions = ">=4.6.0" +[[package]] +name = "azure-cognitiveservices-speech" +version = "1.40.0" +description = "Microsoft Cognitive Services Speech SDK for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:c19e6e95db81e7b2931641ad9c945b8a914541feada9ae131a253ffdf4d731f4"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5f9dcd42cb265ca2e7eb6e4b6a42dc0b2d5e889347cf48e0b254f6911bf88ace"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:9fce3f8b201129adec5d7d17ee004168cff332e74e658efce4fec665eefae54b"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:ad72f0c476ccbbe7a733e76ea4f6eaee2df1455b9e48ba118ba72ddd971c558a"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-win32.whl", hash = "sha256:74e4ead7d685d96df86a31492faddabbeec44b2c9fb16b14c9cc19430a4aa9f8"}, + {file = "azure_cognitiveservices_speech-1.40.0-py3-none-win_amd64.whl", hash = "sha256:8f17752cd89231dd9404b70b4832c17aa1f491326c163f53bfa2c0879c03f14f"}, +] + [[package]] name = "azure-common" version = "1.1.28" @@ -3018,8 +3033,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.3", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3748,9 +3763,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and 
python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3772,9 +3787,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3796,9 +3811,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -4127,8 +4142,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" @@ -6921,4 +6936,4 @@ type = 
["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2e39ea2e1b97d82541a775dc5e9ae7eb8f21a8855f73d176a132b5efeb240e76" +content-hash = "60f93939b4d17185e172caa0b1ce678dba535c16932c7e773893cb018e0b504c" diff --git a/pyproject.toml b/pyproject.toml index c782790..693fa32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ playwright = "^1.47.0" lxml = "^5.3.0" nest-asyncio = "^1.6.0" typer = "^0.12.5" +azure-cognitiveservices-speech = "^1.40.0" [tool.poetry.group.dev.dependencies] pre-commit = "^3.8.0"