Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Only send actual audio to Deepgram #738

Merged
merged 3 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/hot-eagles-happen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"livekit-plugins-deepgram": patch
---

Only send actual audio to Deepgram using a basic audio RMS filter
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

from .log import logger
from .models import DeepgramLanguages, DeepgramModels
from .utils import BasicAudioEnergyFilter

BASE_URL = "https://api.deepgram.com/v1/listen"
BASE_URL_WS = "wss://api.deepgram.com/v1/listen"
Expand Down Expand Up @@ -200,6 +201,7 @@ def __init__(
self._session = http_session
self._speaking = False
self._max_retry = max_retry
self._audio_energy_filter = BasicAudioEnergyFilter(cooldown_seconds=1)

@utils.log_exceptions(logger=logger)
async def _main_task(self) -> None:
Expand Down Expand Up @@ -294,7 +296,9 @@ async def send_task():
frames = audio_bstream.write(data.data.tobytes())

for frame in frames:
await ws.send_bytes(frame.data.tobytes())
has_audio = self._audio_energy_filter.push_frame(frame)
if has_audio:
await ws.send_bytes(frame.data.tobytes())

# tell deepgram we are done sending audio/inputs
closing_ws = True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
from livekit import rtc

# Default RMS level, measured on samples normalized by 32768, above which a frame
# is considered loud enough to possibly contain speech. The value was picked
# empirically during testing and is deliberately very conservative.
MAGIC_NUMBER_THRESHOLD = 0.004


class BasicAudioEnergyFilter:
    """Decide, frame by frame, whether audio is worth forwarding.

    A frame passes when its RMS energy exceeds ``threshold``. After a loud
    frame, quieter frames keep passing until ``cooldown_seconds`` worth of
    quiet audio has been pushed. A new filter starts with a full cooldown, so
    the first ``cooldown_seconds`` of audio passes regardless of its level.

    Args:
        cooldown_seconds: Amount of quiet audio (in seconds of frame duration)
            that still passes after the last loud frame.
        threshold: RMS level, on samples normalized by 32768, that a frame
            must exceed to count as loud. Defaults to
            ``MAGIC_NUMBER_THRESHOLD``.
    """

    def __init__(
        self,
        *,
        cooldown_seconds: float = 1,
        threshold: float = MAGIC_NUMBER_THRESHOLD,
    ):
        self._cooldown_seconds = cooldown_seconds
        # Remaining quiet time that is still let through; starts full.
        self._cooldown = cooldown_seconds
        self._threshold = threshold

    def push_frame(self, frame: "rtc.AudioFrame") -> bool:
        """Feed one frame to the filter and report whether it should be kept.

        Args:
            frame: Audio frame whose ``data`` buffer is read as 16-bit signed
                integers and whose ``samples_per_channel`` / ``sample_rate``
                give its duration.

        Returns:
            ``True`` if the frame is loud, or if it is quiet but the cooldown
            has not run out yet; ``False`` otherwise.
        """
        samples = np.frombuffer(frame.data, dtype=np.int16)
        if samples.size:
            normalized = samples.astype(np.float32) / 32768.0
            rms = float(np.sqrt(np.mean(np.square(normalized))))
        else:
            # np.mean on an empty array warns and yields NaN; an empty frame
            # carries no energy, so treat it as silence explicitly.
            rms = 0.0

        if rms > self._threshold:
            # Loud frame: re-arm the cooldown so trailing quiet audio passes.
            self._cooldown = self._cooldown_seconds
            return True

        # Quiet frame: spend its duration from the remaining cooldown.
        duration_seconds = frame.samples_per_channel / frame.sample_rate
        self._cooldown -= duration_seconds
        return self._cooldown > 0
2 changes: 1 addition & 1 deletion livekit-plugins/livekit-plugins-deepgram/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
license="Apache-2.0",
packages=setuptools.find_namespace_packages(include=["livekit.*"]),
python_requires=">=3.9.0",
install_requires=["livekit-agents>=0.8.0.dev0"],
install_requires=["livekit-agents>=0.8.0", "numpy~=1.21"],
package_data={"livekit.plugins.deepgram": ["py.typed"]},
project_urls={
"Documentation": "https://docs.livekit.io",
Expand Down
Loading