Skip to content

Commit

Permalink
add whisper example
Browse files Browse the repository at this point in the history
  • Loading branch information
ks6088ts committed Oct 9, 2024
1 parent dbd2247 commit 9b28859
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 7 deletions.
4 changes: 4 additions & 0 deletions apps/16_whisper_transcription/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# References

- [openai/whisper](https://github.com/openai/whisper)
- [Improve --model argument handling and help message #1764](https://github.com/openai/whisper/pull/1764)
70 changes: 70 additions & 0 deletions apps/16_whisper_transcription/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import argparse
import logging

import whisper
from dotenv import load_dotenv


def init_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options for the transcription script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse falls back to ``sys.argv[1:]``, so existing callers
            that invoke ``init_args()`` behave exactly as before.

    Returns:
        A namespace with ``model`` (str), ``file`` (str) and
        ``verbose`` (bool) attributes.
    """
    parser = argparse.ArgumentParser(
        prog="whisper_transcription",
        description="Transcript with Whisper model",
    )
    parser.add_argument(
        "-m",
        "--model",
        default="turbo",
        help="Model name",
    )
    parser.add_argument(
        "-f",
        "--file",
        default="dist/sample_audio.wav",
        help="Audio file",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    # Passing argv explicitly keeps the parser usable from tests and other
    # modules without touching the process-wide sys.argv.
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Script entry point: load a Whisper model, detect the spoken language of
    # the first 30 seconds of the given audio file, and print its transcript.
    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Parse .env file and set environment variables
    load_dotenv()

    model = whisper.load_model(name=args.model)

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(
        file=args.file,
    )
    audio = whisper.pad_or_trim(
        array=audio,
        length=30 * 16000,
    )

    # make log-Mel spectrogram and move to the same device as the model
    # https://github.com/openai/whisper/pull/1764
    # The number of mel bins depends on the checkpoint, so read it from the
    # loaded model instead of hard-coding 128; a fixed value breaks every
    # --model choice whose encoder expects a different mel dimension.
    mel = whisper.log_mel_spectrogram(
        audio=audio,
        n_mels=model.dims.n_mels,
    ).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)
95 changes: 88 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ lxml = "^5.3.0"
nest-asyncio = "^1.6.0"
typer = "^0.12.5"
azure-cognitiveservices-speech = "^1.40.0"
openai-whisper = "^20240930"

[tool.poetry.group.dev.dependencies]
pre-commit = "^4.0.0"
Expand Down

0 comments on commit 9b28859

Please sign in to comment.