From 893687e73da1c2a25aaee775d349cbf63508aedf Mon Sep 17 00:00:00 2001 From: Neil Dwyer Date: Tue, 10 Sep 2024 16:08:44 -0700 Subject: [PATCH] Only send actual audio to Deepgram (#738) --- .changeset/hot-eagles-happen.md | 5 ++++ .../livekit/plugins/deepgram/stt.py | 6 ++++- .../livekit/plugins/deepgram/utils.py | 27 +++++++++++++++++++ .../livekit-plugins-deepgram/setup.py | 2 +- 4 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 .changeset/hot-eagles-happen.md create mode 100644 livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py diff --git a/.changeset/hot-eagles-happen.md b/.changeset/hot-eagles-happen.md new file mode 100644 index 000000000..037e597d4 --- /dev/null +++ b/.changeset/hot-eagles-happen.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-deepgram": patch +--- + +Only send actual audio to Deepgram using a basic audio RMS filter diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 53e2e1d55..b1d593abb 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -30,6 +30,7 @@ from .log import logger from .models import DeepgramLanguages, DeepgramModels +from .utils import BasicAudioEnergyFilter BASE_URL = "https://api.deepgram.com/v1/listen" BASE_URL_WS = "wss://api.deepgram.com/v1/listen" @@ -200,6 +201,7 @@ def __init__( self._session = http_session self._speaking = False self._max_retry = max_retry + self._audio_energy_filter = BasicAudioEnergyFilter(cooldown_seconds=1) @utils.log_exceptions(logger=logger) async def _main_task(self) -> None: @@ -294,7 +296,9 @@ async def send_task(): frames = audio_bstream.write(data.data.tobytes()) for frame in frames: - await ws.send_bytes(frame.data.tobytes()) + has_audio = self._audio_energy_filter.push_frame(frame) + if has_audio: + await ws.send_bytes(frame.data.tobytes()) # tell deepgram we are done sending audio/inputs closing_ws = True diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py new file mode 100644 index 000000000..c9c9ee452 --- /dev/null +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py @@ -0,0 +1,27 @@ +import numpy as np +from livekit import rtc + +# This is the magic number during testing that we use to determine if a frame is loud enough +# to possibly contain speech. It's very conservative. +MAGIC_NUMBER_THRESHOLD = 0.004 + + +class BasicAudioEnergyFilter: + def __init__(self, *, cooldown_seconds: float = 1): + self._cooldown_seconds = cooldown_seconds + self._cooldown = cooldown_seconds + + def push_frame(self, frame: rtc.AudioFrame) -> bool: + arr = np.frombuffer(frame.data, dtype=np.int16) + float_arr = arr.astype(np.float32) / 32768.0 + rms = np.sqrt(np.mean(np.square(float_arr))) + if rms > MAGIC_NUMBER_THRESHOLD: + self._cooldown = self._cooldown_seconds + return True + + duration_seconds = frame.samples_per_channel / frame.sample_rate + self._cooldown -= duration_seconds + if self._cooldown > 0: + return True + + return False diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py index 37b739565..98a4b82ba 100644 --- a/livekit-plugins/livekit-plugins-deepgram/setup.py +++ b/livekit-plugins/livekit-plugins-deepgram/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.8.0.dev0"], + install_requires=["livekit-agents>=0.8.0", "numpy~=1.21"], package_data={"livekit.plugins.deepgram": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io",