From f05bcd9951bb2e86139c9e278145a95bd4fd69fa Mon Sep 17 00:00:00 2001 From: Neil Dwyer Date: Tue, 10 Sep 2024 11:47:10 -0700 Subject: [PATCH 1/3] Only send actual audio to Deepgram --- .../livekit/plugins/deepgram/stt.py | 8 ++++++- .../livekit/plugins/deepgram/utils.py | 21 +++++++++++++++++++ .../livekit-plugins-deepgram/setup.py | 2 +- 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 53e2e1d55..2438a68cb 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -30,6 +30,7 @@ from .log import logger from .models import DeepgramLanguages, DeepgramModels +from .utils import BasicAudioEnergyFilter BASE_URL = "https://api.deepgram.com/v1/listen" BASE_URL_WS = "wss://api.deepgram.com/v1/listen" @@ -200,6 +201,9 @@ def __init__( self._session = http_session self._speaking = False self._max_retry = max_retry + self._audio_energy_filter = BasicAudioEnergyFilter( + threshold=0.1, cooldown_seconds=1 + ) @utils.log_exceptions(logger=logger) async def _main_task(self) -> None: @@ -294,7 +298,9 @@ async def send_task(): frames = audio_bstream.write(data.data.tobytes()) for frame in frames: - await ws.send_bytes(frame.data.tobytes()) + has_audio = self._audio_energy_filter.push_frame(frame) + if has_audio: + await ws.send_bytes(frame.data.tobytes()) # tell deepgram we are done sending audio/inputs closing_ws = True diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py new file mode 100644 index 000000000..3cc8b464f --- /dev/null +++ 
b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py @@ -0,0 +1,21 @@ +import numpy as np +from livekit import rtc + + +class BasicAudioEnergyFilter: + def __init__(self, *, threshold: float = 0.1, cooldown_seconds: float = 1): + self.threshold = threshold + self._cooldown = 1 + + def push_frame(self, frame: rtc.AudioFrame) -> bool: + arr = np.frombuffer(frame.data, dtype=np.int16) + if np.sum(arr**2) > self.threshold: + self._cooldown = 1 + return True + + duration_seconds = frame.samples_per_channel / frame.sample_rate + self._cooldown -= duration_seconds + if self._cooldown > 0: + return True + + return False diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py index 37b739565..521cb47dc 100644 --- a/livekit-plugins/livekit-plugins-deepgram/setup.py +++ b/livekit-plugins/livekit-plugins-deepgram/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.8.0.dev0"], + install_requires=["livekit-agents>=0.8.0", "numpy~=2.1"], package_data={"livekit.plugins.deepgram": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", From 0e42e1b4ed3b7c96bdfe0e7ff0780af83c104442 Mon Sep 17 00:00:00 2001 From: Neil Dwyer Date: Tue, 10 Sep 2024 12:17:32 -0700 Subject: [PATCH 2/3] configure magic number --- .../livekit/plugins/deepgram/stt.py | 4 +--- .../livekit/plugins/deepgram/utils.py | 16 +++++++++++----- .../livekit-plugins-deepgram/setup.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 2438a68cb..b1d593abb 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ 
-201,9 +201,7 @@ def __init__( self._session = http_session self._speaking = False self._max_retry = max_retry - self._audio_energy_filter = BasicAudioEnergyFilter( - threshold=0.1, cooldown_seconds=1 - ) + self._audio_energy_filter = BasicAudioEnergyFilter(cooldown_seconds=1) @utils.log_exceptions(logger=logger) async def _main_task(self) -> None: diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py index 3cc8b464f..c9c9ee452 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py @@ -1,16 +1,22 @@ import numpy as np from livekit import rtc +# Magic-number RMS threshold, from testing, that we use to determine if a frame is loud enough +# to possibly contain speech. It's very conservative. +MAGIC_NUMBER_THRESHOLD = 0.004 + class BasicAudioEnergyFilter: - def __init__(self, *, threshold: float = 0.1, cooldown_seconds: float = 1): - self.threshold = threshold - self._cooldown = 1 + def __init__(self, *, cooldown_seconds: float = 1): + self._cooldown_seconds = cooldown_seconds + self._cooldown = cooldown_seconds def push_frame(self, frame: rtc.AudioFrame) -> bool: arr = np.frombuffer(frame.data, dtype=np.int16) - if np.sum(arr**2) > self.threshold: - self._cooldown = 1 + float_arr = arr.astype(np.float32) / 32768.0 + rms = np.sqrt(np.mean(np.square(float_arr))) + if rms > MAGIC_NUMBER_THRESHOLD: + self._cooldown = self._cooldown_seconds return True duration_seconds = frame.samples_per_channel / frame.sample_rate diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py index 521cb47dc..98a4b82ba 100644 --- a/livekit-plugins/livekit-plugins-deepgram/setup.py +++ b/livekit-plugins/livekit-plugins-deepgram/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", 
packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.8.0", "numpy~=2.1"], + install_requires=["livekit-agents>=0.8.0", "numpy~=1.21"], package_data={"livekit.plugins.deepgram": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", From 0cea9e16edf15749d8e2dc09f7e1340c14fcc4d7 Mon Sep 17 00:00:00 2001 From: Neil Dwyer Date: Tue, 10 Sep 2024 12:20:54 -0700 Subject: [PATCH 3/3] Create hot-eagles-happen.md --- .changeset/hot-eagles-happen.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/hot-eagles-happen.md diff --git a/.changeset/hot-eagles-happen.md b/.changeset/hot-eagles-happen.md new file mode 100644 index 000000000..037e597d4 --- /dev/null +++ b/.changeset/hot-eagles-happen.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-deepgram": patch +--- + +Only send actual audio to Deepgram using a basic audio RMS filter