Merge pull request #163 from ks6088ts-labs/feature/issue-157_speech-service

WIP: add ai speech sample
ks6088ts authored Oct 8, 2024
2 parents c2adbf6 + f74745c commit e4ead26
Showing 7 changed files with 418 additions and 6 deletions.
1 change: 1 addition & 0 deletions .env.template
@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
# Azure AI Speech
AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
AZURE_AI_SPEECH_API_REGION="eastus"

# Bing search resource
BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
2 changes: 2 additions & 0 deletions .gitignore
@@ -167,3 +167,5 @@ generated/
*.jpg
*.jpeg
.chroma
.stop
.transcribed.txt
26 changes: 26 additions & 0 deletions apps/14_streamlit_azure_ai_speech/README.md
@@ -0,0 +1,26 @@
# Realtime transcription with Azure AI Speech Service

This app demonstrates how to use Azure AI Speech Service for realtime transcription.

## Prerequisites

- Python 3.10 or later
- Azure AI Speech Service
- Azure OpenAI Service

## Overview

```shell
# Speech to Text script
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help

# WIP: Streamlit app
poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py
```
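
The `speech_to_text.py` script itself is not rendered in this view. Its core is presumably a continuous-recognition loop from the Speech SDK; the following is a minimal sketch of such a loop, assuming the `azure-cognitiveservices-speech` package and the environment variables from `.env.template`. The actual script in this commit may differ.

```python
import os

import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(
    subscription=os.environ["AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"],
    region=os.environ["AZURE_AI_SPEECH_API_REGION"],
)
speech_config.speech_recognition_language = "en-US"

# Recognize continuously from the default microphone
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)


def on_recognized(evt: speechsdk.SpeechRecognitionEventArgs) -> None:
    # Append each finalized utterance to the transcript file read by the app
    with open(".transcribed.txt", "a") as f:
        f.write(evt.result.text + "\n")


recognizer.recognized.connect(on_recognized)
recognizer.start_continuous_recognition()
input("Press Enter to stop...\n")
recognizer.stop_continuous_recognition()
```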

## References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python)
- [AzureSpeechService でリアルタイム議事録](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
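
For the `local` inference type, the app defaults `STT_HOST` to `ws://localhost:5000`, which matches the speech-to-text container from the Docker reference above. A typical invocation per those docs (the billing endpoint and key placeholders mirror `.env.template`; adjust memory and CPU to your host) might look like:

```shell
docker run --rm -it -p 5000:5000 --memory 8g --cpus 4 \
  mcr.microsoft.com/azure-cognitive-services/speechservices/speech-to-text \
  Eula=accept \
  Billing="https://<speech-api-name>.cognitiveservices.azure.com/" \
  ApiKey="<speech-api-subscription-key>"
```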
210 changes: 210 additions & 0 deletions apps/14_streamlit_azure_ai_speech/main.py
@@ -0,0 +1,210 @@
import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
st.session_state["transcribed_result"] = ""

with st.sidebar:
    inference_type = st.selectbox(
        label="INFERENCE_TYPE",
        options=[
            "azure",
            "local",
        ],
        key="INFERENCE_TYPE",
    )
    azure_ai_speech_api_language = st.selectbox(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        options=[
            "en-US",
            "ja-JP",
        ],
        key="AZURE_AI_SPEECH_API_LANGUAGE",
    )
    if inference_type == "local":
        path_to_model = st.text_input(
            label="PATH_TO_MODEL",
            value="./model",
            key="PATH_TO_MODEL",
            type="default",
        )
        stt_host = st.text_input(
            label="STT_HOST",
            value="ws://localhost:5000",
            key="STT_HOST",
            type="default",
        )
        st.warning("yet to be implemented")
    if inference_type == "azure":
        azure_openai_endpoint = st.text_input(
            label="AZURE_OPENAI_ENDPOINT",
            value=getenv("AZURE_OPENAI_ENDPOINT"),
            key="AZURE_OPENAI_ENDPOINT",
            type="default",
        )
        azure_openai_api_key = st.text_input(
            label="AZURE_OPENAI_API_KEY",
            value=getenv("AZURE_OPENAI_API_KEY"),
            key="AZURE_OPENAI_API_KEY",
            type="password",
        )
        azure_openai_api_version = st.text_input(
            label="AZURE_OPENAI_API_VERSION",
            value=getenv("AZURE_OPENAI_API_VERSION"),
            key="AZURE_OPENAI_API_VERSION",
            type="default",
        )
        azure_openai_gpt_model = st.text_input(
            label="AZURE_OPENAI_GPT_MODEL",
            value=getenv("AZURE_OPENAI_GPT_MODEL"),
            key="AZURE_OPENAI_GPT_MODEL",
            type="default",
        )
        azure_ai_speech_api_subscription_key = st.text_input(
            label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
            key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            type="password",
        )
        azure_ai_speech_api_region = st.text_input(
            label="AZURE_AI_SPEECH_API_REGION",
            value=getenv("AZURE_AI_SPEECH_API_REGION"),
            key="AZURE_AI_SPEECH_API_REGION",
            type="default",
        )
    # Streamlit "magic" renders these bare strings as markdown links
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    if inference_type == "local":
        return path_to_model and stt_host
    if inference_type == "azure":
        # start_recognition also needs the Speech key and region, so require them here too
        return all([azure_openai_api_key, azure_openai_endpoint, azure_openai_api_version,
                    azure_openai_gpt_model, azure_ai_speech_api_subscription_key, azure_ai_speech_api_region])


st.title("transcribe text")

if not is_configured():
st.warning("Please fill in the required fields at the sidebar.")

st.info("This is a sample to transcribe text.")

# ---
# 2 column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# line break horizontal line
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    selected_task = st.selectbox(
        "Task",
        [
            "Create summaries from the following text",
            "Extract 3 main points from the following text",
            # Add more tasks here
        ],
        key="selected_task",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    if inference_type == "local":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --endpoint {stt_host} --language {azure_ai_speech_api_language} --type local --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --type azure --verbose"  # noqa
        # Keep the Popen handle in session state so later reruns can tell a transcription is active
        st.session_state["process"] = subprocess.Popen(command, shell=True)


def run_task(selected_task: str, input: str) -> str:
    if inference_type == "local":
        st.warning("Local inference is not yet implemented.")
        return ""
    if inference_type == "azure":
        client = AzureOpenAI(
            api_key=azure_openai_api_key,
            api_version=azure_openai_api_version,
            azure_endpoint=azure_openai_endpoint,
        )

        response = client.chat.completions.create(
            model=azure_openai_gpt_model,
            messages=[
                {
                    "role": "system",
                    "content": f"""
Task: {selected_task}.
---
{input}
---
""",
                },
            ],
        )
        return response.choices[0].message.content
    raise ValueError(f"Inference type is not supported: {inference_type}")


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})")
        start_recognition()
    else:
        transcription_status.warning("Transcription is already running.")

if stop_transcribe_button:
    # Touching the sentinel file signals the recognizer script to stop
    pathlib.Path(".stop").touch()
    st.session_state["process"] = None
    output = load_transcribed_text()
    st.session_state.transcribed_result = output
    st.rerun()

if run_task_button:
    with st.spinner("Running..."):
        output = run_task(
            selected_task=selected_task,
            input=input,
        )
        st.write(output)
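
The UI coordinates with the background recognizer through two files, both newly git-ignored above: it reads the transcript from `.transcribed.txt` and signals shutdown by touching `.stop`. The recognizer's side of that handshake lives in `speech_to_text.py`, which is not rendered in this view; a plausible sketch of its stop-polling helper, with all names assumed:

```python
# Hypothetical sketch of how speech_to_text.py could honor the .stop sentinel.
import pathlib
import time

STOP_FILE = pathlib.Path(".stop")


def wait_for_stop_signal(poll_interval_seconds: float = 1.0) -> None:
    """Block until the UI touches the sentinel file, then clean it up."""
    while not STOP_FILE.exists():
        time.sleep(poll_interval_seconds)
    STOP_FILE.unlink()
```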
