livekit · keepingitneil · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/.changeset/hot-eagles-happen.md b/.changeset/hot-eagles-happen.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-deepgram": patch
+---
+
+Use a basic audio RMS filter to send only actual audio to Deepgram
diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py
@@ -30,6 +30,7 @@
 
 from .log import logger
 from .models import DeepgramLanguages, DeepgramModels
+from .utils import BasicAudioEnergyFilter
 
 BASE_URL = "https://api.deepgram.com/v1/listen"
 BASE_URL_WS = "wss://api.deepgram.com/v1/listen"
@@ -200,6 +201,7 @@ def __init__(
         self._session = http_session
         self._speaking = False
         self._max_retry = max_retry
+        self._audio_energy_filter = BasicAudioEnergyFilter(cooldown_seconds=1)
 
     @utils.log_exceptions(logger=logger)
     async def _main_task(self) -> None:
@@ -294,7 +296,9 @@ async def send_task():
                     frames = audio_bstream.write(data.data.tobytes())
 
                 for frame in frames:
-                    await ws.send_bytes(frame.data.tobytes())
+                    has_audio = self._audio_energy_filter.push_frame(frame)
+                    if has_audio:
+                        await ws.send_bytes(frame.data.tobytes())
 
             # tell deepgram we are done sending audio/inputs
             closing_ws = True

diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/utils.py
@@ -0,0 +1,27 @@
+import numpy as np
+from livekit import rtc
+
+# This is the magic number, found during testing, that we use to determine if a frame is loud enough
+# to possibly contain speech. It's very conservative.
+MAGIC_NUMBER_THRESHOLD = 0.004
+
+
+class BasicAudioEnergyFilter:
+    def __init__(self, *, cooldown_seconds: float = 1):
+        self._cooldown_seconds = cooldown_seconds
+        self._cooldown = cooldown_seconds
+
+    def push_frame(self, frame: rtc.AudioFrame) -> bool:
+        arr = np.frombuffer(frame.data, dtype=np.int16)
+        float_arr = arr.astype(np.float32) / 32768.0
+        rms = np.sqrt(np.mean(np.square(float_arr)))
+        if rms > MAGIC_NUMBER_THRESHOLD:
+            self._cooldown = self._cooldown_seconds
+            return True
+
+        duration_seconds = frame.samples_per_channel / frame.sample_rate
+        self._cooldown -= duration_seconds
+        if self._cooldown > 0:
+            return True
+
+        return False
diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py
@@ -47,7 +47,7 @@
     license="Apache-2.0",
     packages=setuptools.find_namespace_packages(include=["livekit.*"]),
     python_requires=">=3.9.0",
-    install_requires=["livekit-agents>=0.8.0.dev0"],
+    install_requires=["livekit-agents>=0.8.0", "numpy~=1.21"],
     package_data={"livekit.plugins.deepgram": ["py.typed"]},
     project_urls={
         "Documentation": "https://docs.livekit.io",