Skip to content

Commit

Permalink
Only send actual audio to Deepgram (#738)
Browse files Browse the repository at this point in the history
  • Loading branch information
keepingitneil authored Sep 10, 2024
1 parent 7b204f2 commit 893687e
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 2 deletions.
5 changes: 5 additions & 0 deletions .changeset/hot-eagles-happen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"livekit-plugins-deepgram": patch
---

Only send actual audio to Deepgram using a basic audio RMS filter
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

from .log import logger
from .models import DeepgramLanguages, DeepgramModels
from .utils import BasicAudioEnergyFilter

BASE_URL = "https://api.deepgram.com/v1/listen"
BASE_URL_WS = "wss://api.deepgram.com/v1/listen"
Expand Down Expand Up @@ -200,6 +201,7 @@ def __init__(
self._session = http_session
self._speaking = False
self._max_retry = max_retry
self._audio_energy_filter = BasicAudioEnergyFilter(cooldown_seconds=1)

@utils.log_exceptions(logger=logger)
async def _main_task(self) -> None:
Expand Down Expand Up @@ -294,7 +296,9 @@ async def send_task():
frames = audio_bstream.write(data.data.tobytes())

for frame in frames:
await ws.send_bytes(frame.data.tobytes())
has_audio = self._audio_energy_filter.push_frame(frame)
if has_audio:
await ws.send_bytes(frame.data.tobytes())

# tell deepgram we are done sending audio/inputs
closing_ws = True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
from livekit import rtc

# RMS energy threshold, found empirically during testing, above which a frame
# is considered loud enough to possibly contain speech. It is compared against
# the RMS of int16 samples scaled by 1/32768 (see BasicAudioEnergyFilter below),
# and is deliberately very conservative.
MAGIC_NUMBER_THRESHOLD = 0.004


class BasicAudioEnergyFilter:
    """Decide per frame whether audio is worth forwarding, based on RMS energy.

    A frame passes when its RMS energy exceeds the threshold. After a loud
    frame, quieter frames keep passing until ``cooldown_seconds`` worth of
    quiet audio has been pushed, so trailing low-energy audio is not cut off
    immediately. The cooldown starts full, so the first ``cooldown_seconds``
    of audio pass regardless of energy.
    """

    def __init__(
        self,
        *,
        cooldown_seconds: float = 1,
        threshold: "float | None" = None,
    ):
        """Create a filter.

        Args:
            cooldown_seconds: How much quiet audio (in seconds of frame
                duration) still passes after the last loud frame.
            threshold: RMS level, on samples scaled by 1/32768, above which a
                frame counts as loud. ``None`` (the default) uses
                ``MAGIC_NUMBER_THRESHOLD``.
        """
        self._cooldown_seconds = cooldown_seconds
        self._cooldown = cooldown_seconds
        # Resolved here rather than in the signature so that ``None`` can act
        # as "use the module default".
        self._threshold = (
            MAGIC_NUMBER_THRESHOLD if threshold is None else threshold
        )

    def push_frame(self, frame: "rtc.AudioFrame") -> bool:
        """Feed one frame and report whether it should be forwarded.

        Args:
            frame: Audio frame; ``frame.data`` is read as signed 16-bit
                samples, and ``samples_per_channel`` / ``sample_rate`` give
                the frame duration used to drain the cooldown.

        Returns:
            ``True`` if the frame is above the threshold or the cooldown has
            not yet run out, ``False`` otherwise.
        """
        samples = np.frombuffer(frame.data, dtype=np.int16)
        if samples.size == 0:
            # np.mean over an empty array yields NaN and emits a
            # RuntimeWarning; an empty frame simply carries no energy.
            rms = 0.0
        else:
            normalized = samples.astype(np.float32) / 32768.0
            rms = np.sqrt(np.mean(np.square(normalized)))

        if rms > self._threshold:
            # Loud frame: refill the cooldown and let it through.
            self._cooldown = self._cooldown_seconds
            return True

        # Quiet frame: spend its duration from the remaining cooldown.
        duration_seconds = frame.samples_per_channel / frame.sample_rate
        self._cooldown -= duration_seconds
        return self._cooldown > 0
2 changes: 1 addition & 1 deletion livekit-plugins/livekit-plugins-deepgram/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
license="Apache-2.0",
packages=setuptools.find_namespace_packages(include=["livekit.*"]),
python_requires=">=3.9.0",
install_requires=["livekit-agents>=0.8.0.dev0"],
install_requires=["livekit-agents>=0.8.0", "numpy~=1.21"],
package_data={"livekit.plugins.deepgram": ["py.typed"]},
project_urls={
"Documentation": "https://docs.livekit.io",
Expand Down

0 comments on commit 893687e

Please sign in to comment.