Update deepgram endpointing (#145)

* deepgram: Add min_silence_duration to the Deepgram client. Deepgram controls VAD through its endpointing parameter; this fix allows min_silence_duration to be configured in the agents layer.
* add utterance_end_ms and speech_final to Deepgram plugin
* add utterance_end_ms and speech_final to Deepgram plugin
* expose speech_final as end_of_speech

---------

Co-authored-by: Lam Nguyen <[email protected]>
livekit · Feb 6, 2024 · 604d7e3 · 604d7e3
1 parent 27ea3c1
commit 604d7e3
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 1 deletion.
diff --git a/livekit-agents/livekit/agents/stt/stt.py b/livekit-agents/livekit/agents/stt/stt.py
@@ -18,6 +18,7 @@ class SpeechData:
 class SpeechEvent:
     is_final: bool
     alternatives: List[SpeechData]
+    end_of_speech: bool = False
 
 
 class STT(ABC):

diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py
@@ -22,7 +22,7 @@ class STTOptions:
     punctuate: bool
     model: DeepgramModels
     smart_format: bool
-
+    endpointing: Optional[str]
 
 class STT(stt.STT):
     def __init__(
@@ -36,6 +36,7 @@ def __init__(
         model: DeepgramModels = "nova-2-general",
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
+        min_silence_duration: int = 10,
     ) -> None:
         super().__init__(streaming_supported=True)
         api_key = api_key or os.environ.get("DEEPGRAM_API_KEY")
@@ -51,6 +52,7 @@ def __init__(
             punctuate=punctuate,
             model=model,
             smart_format=smart_format,
+            endpointing=str(min_silence_duration),
         )
 
     def _sanitize_options(
@@ -197,6 +199,7 @@ async def on_transcript_received(
                     sample_rate=self._sample_rate,
                     smart_format=self._config.smart_format,
                     punctuate=self._config.punctuate,
+                    endpointing=self._config.endpointing,
                 )
                 await self._live.start(dg_opts)
                 opened = True
@@ -244,6 +247,7 @@ def live_transcription_to_speech_event(
 
     return stt.SpeechEvent(
         is_final=event.is_final or False,  # could be None?
+        end_of_speech=event.speech_final or False,
         alternatives=[
             stt.SpeechData(
                 language=language or "",