From 12a36d42c08f9843079f4dfc200a7b55a6bbbb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Tue, 30 Jul 2024 22:56:25 +0200 Subject: [PATCH 01/24] voiceassistant: cleanup validation behaviour (#545) --- examples/voice-assistant/minimal_assistant.py | 3 + examples/voice-assistant/requirements.txt | 2 +- .../agents/voice_assistant/agent_output.py | 3 + .../agents/voice_assistant/voice_assistant.py | 354 +++++++++--------- 4 files changed, 193 insertions(+), 169 deletions(-) diff --git a/examples/voice-assistant/minimal_assistant.py b/examples/voice-assistant/minimal_assistant.py index 13b031ffe..976fb1243 100644 --- a/examples/voice-assistant/minimal_assistant.py +++ b/examples/voice-assistant/minimal_assistant.py @@ -1,9 +1,12 @@ import asyncio +from dotenv import load_dotenv from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli, llm from livekit.agents.voice_assistant import VoiceAssistant from livekit.plugins import deepgram, openai, silero +load_dotenv() + async def entrypoint(ctx: JobContext): initial_ctx = llm.ChatContext().append( diff --git a/examples/voice-assistant/requirements.txt b/examples/voice-assistant/requirements.txt index f08e8e3f4..4e53fbbcb 100644 --- a/examples/voice-assistant/requirements.txt +++ b/examples/voice-assistant/requirements.txt @@ -2,4 +2,4 @@ livekit-agents>=0.8.2 livekit-plugins-openai>=0.7.1 livekit-plugins-deepgram>=0.6.1 livekit-plugins-silero>=0.6.1 - +python-dotenv~=1.0 \ No newline at end of file diff --git a/livekit-agents/livekit/agents/voice_assistant/agent_output.py b/livekit-agents/livekit/agents/voice_assistant/agent_output.py index 224a5238f..399d0e74f 100644 --- a/livekit-agents/livekit/agents/voice_assistant/agent_output.py +++ b/livekit-agents/livekit/agents/voice_assistant/agent_output.py @@ -63,6 +63,9 @@ def play(self) -> PlayoutHandle: def interrupt(self) -> None: """Interrupt the speech""" + if self.interrupted: + return + if self._play_handle is not None: self._play_handle.interrupt() diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index 324f20505..6f4e9fdb0 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -10,7 +10,7 @@ from .. import stt, tokenize, tts, utils, vad from ..llm import LLM, ChatContext, ChatMessage, FunctionContext, LLMStream -from .agent_output import AgentOutput, SpeechSource, SynthesisHandle +from .agent_output import AgentOutput, SynthesisHandle from .cancellable_source import CancellableAudioSource from .human_input import HumanInput from .log import logger @@ -20,11 +20,14 @@ @dataclass class _SpeechInfo: source: str | LLMStream | AsyncIterable[str] - user_question: str # empty when the speech isn't an answer allow_interruptions: bool add_to_chat_ctx: bool synthesis_handle: SynthesisHandle + # is_reply = True when the speech is answering to a user question + is_reply: bool = False + user_question: str = "" + WillSynthesizeAssistantReply = Callable[ ["VoiceAssistant", ChatContext], @@ -111,6 +114,9 @@ class AssistantTranscriptionOptions: class VoiceAssistant(utils.EventEmitter[EventTypes]): + MIN_TIME_PLAYED_FOR_COMMIT = 1.5 + """Minimum time played for the user speech to be committed to the chat context""" + def __init__( self, *, @@ -129,6 +135,27 @@ def __init__( plotting: bool = False, loop: asyncio.AbstractEventLoop | None = None, ) -> None: + """ + Create a new VoiceAssistant. + + Args: + vad: Voice Activity Detection (VAD) instance. + stt: Speech-to-Text (STT) instance. + llm: Large Language Model (LLM) instance. + tts: Text-to-Speech (TTS) instance. + chat_ctx: Chat context for the assistant. + fnc_ctx: Function context for the assistant. + allow_interruptions: Whether to allow the user to interrupt the assistant. + interrupt_speech_duration: Minimum duration of speech to consider for interruption. + interrupt_min_words: Minimum number of words to consider for interruption. + Defaults to 0 as this may increase the latency depending on the STT. + preemptive_synthesis: Whether to preemptively synthesize responses. + transcription: Options for assistant transcription. + will_synthesize_assistant_reply: Callback called when the assistant is about to synthesize a reply. + This can be used to customize the reply (e.g: inject context/RAG). + plotting: Whether to enable plotting for debugging. matplotlib must be installed. + loop: Event loop to use. Default to asyncio.get_event_loop(). + """ super().__init__() self._loop = loop or asyncio.get_event_loop() self._opts = _ImplOptions( @@ -142,8 +169,8 @@ def __init__( ) self._plotter = AssistantPlotter(self._loop) - # wrap with StreamAdapter automatically when streaming is not supported on a specific TTS - # to override StreamAdapter options, create the adapter manually + # wrap with StreamAdapter automatically when streaming is not supported on a specific TTS/STT. + # To override StreamAdapter options, create the adapter manually. if not tts.capabilities.streaming: from .. import tts as text_to_speech @@ -167,19 +194,23 @@ def __init__( self._human_input: HumanInput | None = None self._agent_output: AgentOutput | None = None + + # done when the agent output track is published self._track_published_fut = asyncio.Future[None]() - self._agent_answer_speech: _SpeechInfo | None = None - self._agent_playing_speech: _SpeechInfo | None = None - self._agent_answer_atask: asyncio.Task[None] | None = None - self._playout_ch = utils.aio.Chan[_SpeechInfo]() + self._pending_agent_reply: _SpeechInfo | None = None + self._pending_agent_reply_task: asyncio.Task[None] | None = None + self._playing_speech: _SpeechInfo | None = None self._transcribed_text, self._transcribed_interim_text = "", "" - self._deferred_validation = _DeferredAnswerValidation( - self._validate_answer_if_needed, loop=self._loop + self._deferred_validation = _DeferredReplyValidation( + self._validate_reply_if_possible, loop=self._loop ) + self._speech_q: list[_SpeechInfo] = [] + self._speech_q_changed = asyncio.Event() + @property def fnc_ctx(self) -> FunctionContext | None: return self._fnc_ctx @@ -230,48 +261,13 @@ def start( else: self._link_participant(participant) else: - # no participant provided, try to find the first in the room + # no participant provided, try to find the first participant in the room for participant in self._room.remote_participants.values(): self._link_participant(participant.identity) break self._main_atask = asyncio.create_task(self._main_task()) - async def say( - self, - source: str | LLMStream | AsyncIterable[str], - *, - allow_interruptions: bool = True, - add_to_chat_ctx: bool = True, - ) -> None: - """ - Make the assistant say something. - The source can be a string, an LLMStream or an AsyncIterable[str] - - Args: - source: the source of the speech - allow_interruptions: whether the speech can be interrupted - add_to_chat_ctx: whether to add the speech to the chat context - """ - await self._track_published_fut - assert ( - self._agent_output is not None - ), "agent output should be initialized when ready" - - speech_source = source - if isinstance(speech_source, LLMStream): - speech_source = _llm_stream_to_str_iterable(speech_source) - - synthesis_handle = self._agent_synthesize(transcript=speech_source) - speech = _SpeechInfo( - source=source, - user_question="", - allow_interruptions=allow_interruptions, - add_to_chat_ctx=add_to_chat_ctx, - synthesis_handle=synthesis_handle, - ) - self._playout_ch.send_nowait(speech) - def on(self, event: EventTypes, callback: Callable[[Any], None] | None = None): """Register a callback for an event @@ -290,6 +286,32 @@ def on(self, event: EventTypes, callback: Callable[[Any], None] | None = None): """ return super().on(event, callback) + async def say( + self, + source: str | LLMStream | AsyncIterable[str], + *, + allow_interruptions: bool = True, + add_to_chat_ctx: bool = True, + ) -> None: + """ + Play a speech source through the voice assistant. + + Args: + source: The source of the speech to play. + It can be a string, an LLMStream, or an asynchronous iterable of strings. + allow_interruptions: Whether to allow interruptions during the speech playback. + add_to_chat_ctx: Whether to add the speech to the chat context. + """ + await self._track_published_fut + self._add_speech_for_playout( + _SpeechInfo( + source=source, + allow_interruptions=allow_interruptions, + add_to_chat_ctx=add_to_chat_ctx, + synthesis_handle=self._synthesize_agent_speech(source), + ) + ) + async def aclose(self) -> None: """Close the voice assistant""" if not self._started: @@ -341,7 +363,7 @@ def _on_vad_updated(ev: vad.VADEvent) -> None: self._plotter.plot_value("vad_probability", ev.probability) if ev.speech_duration >= self._opts.int_speech_duration: - self._interrupt_if_needed() + self._interrupt_if_possible() def _on_end_of_speech(ev: vad.VADEvent) -> None: self._plotter.plot_event("user_stopped_speaking") @@ -355,9 +377,9 @@ def _on_final_transcript(ev: stt.SpeechEvent) -> None: self._transcribed_text += ev.alternatives[0].text if self._opts.preemptive_synthesis: - self._synthesize_answer( - user_transcript=self._transcribed_text, force_play=False - ) + self._synthesize_agent_reply() + + self._deferred_validation.on_human_final_transcript(ev.alternatives[0].text) self._human_input.on("start_of_speech", _on_start_of_speech) self._human_input.on("vad_inference_done", _on_vad_updated) @@ -397,63 +419,25 @@ def _on_playout_stopped(cancelled: bool) -> None: self._track_published_fut.set_result(None) - # play validated speeches - async for speech in self._playout_ch: - self._agent_playing_speech = speech - await self._play_speech(speech) - self._agent_playing_speech = None - - def _validate_answer_if_needed(self) -> None: - """ - Check if the user speech should be validated/played - """ - if ( - self._agent_answer_speech is not None - and not self._agent_answer_speech.synthesis_handle.interrupted - ): - self._playout_ch.send_nowait(self._agent_answer_speech) - self._agent_answer_speech = None - elif not self._opts.preemptive_synthesis and self._transcribed_text: - self._synthesize_answer( - user_transcript=self._transcribed_text, force_play=True - ) - - def _interrupt_if_needed(self) -> None: - """ - Check whether the current assistant speech should be interrupted - """ - if ( - self._agent_playing_speech is None - or not self._agent_playing_speech.allow_interruptions - or self._agent_playing_speech.synthesis_handle.interrupted - ): - return + while True: + await self._speech_q_changed.wait() - if self._opts.int_min_words != 0: - # check the final/interim transcribed text for the minimum word count - # to interrupt the agent speech - interim_words = self._opts.transcription.word_tokenizer.tokenize( - text=self._transcribed_interim_text - ) - if len(interim_words) < self._opts.int_min_words: - return + while self._speech_q: + speech = self._speech_q.pop(0) + self._playing_speech = speech + await self._play_speech(speech) + self._playing_speech = None - self._agent_playing_speech.synthesis_handle.interrupt() + self._speech_q_changed.clear() - def _synthesize_answer(self, *, user_transcript: str, force_play: bool) -> None: - """ - Synthesize the answer to the user question and make sure - only one answer is synthesized at a time - """ + def _synthesize_agent_reply(self, *, validated: bool = False) -> None: + """Synthesize the agent reply to the user question, also make sure only one reply + is synthesized/played at a time""" @utils.log_exceptions(logger=logger) - async def _synthesize_answer_task(old_task: asyncio.Task[None]) -> None: - # Use an async task to synthesize the agent answer to - # allow users to execute async code inside the will_create_llm_stream callback - assert ( - self._agent_output is not None - ), "agent output should be initialized when ready" - + async def _synthesize_answer_task( + old_task: asyncio.Task[None], user_transcript: str + ) -> None: if old_task is not None: await utils.aio.gracefully_cancel(old_task) @@ -471,53 +455,45 @@ async def _synthesize_answer_task(old_task: asyncio.Task[None]) -> None: self, chat_ctx=copied_ctx ) - synthesis = self._agent_synthesize( - transcript=_llm_stream_to_str_iterable(llm_stream) - ) - self._agent_answer_speech = _SpeechInfo( + reply = _SpeechInfo( source=llm_stream, - user_question=user_transcript, allow_interruptions=self._opts.allow_interruptions, add_to_chat_ctx=True, - synthesis_handle=synthesis, + synthesis_handle=self._synthesize_agent_speech(llm_stream), + is_reply=True, + user_question=user_transcript, ) - self._deferred_validation.on_new_synthesis(user_transcript) - - if force_play: - self._playout_ch.send_nowait(self._agent_answer_speech) - if self._agent_answer_speech is not None: - self._agent_answer_speech.synthesis_handle.interrupt() + if validated: + self._add_speech_for_playout(reply) + else: + self._pending_agent_reply = reply - self._agent_answer_speech = None - old_task = self._agent_answer_atask + # interrupt the current reply synthesis + if self._pending_agent_reply is not None: + self._pending_agent_reply.synthesis_handle.interrupt() + self._pending_agent_reply = None - self._agent_answer_atask = asyncio.create_task( - _synthesize_answer_task(old_task) + self._pending_agent_reply_task = asyncio.create_task( + _synthesize_answer_task( + self._pending_agent_reply_task, self._transcribed_text + ) ) async def _play_speech(self, speech_info: _SpeechInfo) -> None: - logger.debug("VoiceAssistant._play_speech started") - - assert self._agent_playing_speech is not None - - MIN_TIME_PLAYED_FOR_COMMIT = 1.5 - - assert ( - self._agent_output is not None - ), "agent output should be initialized when ready" - synthesis_handle = speech_info.synthesis_handle if synthesis_handle.interrupted: return + logger.debug("VoiceAssistant._play_speech started") + user_question = speech_info.user_question user_speech_commited = False play_handle = synthesis_handle.play() join_fut = play_handle.join() - def _commit_user_message_if_needed() -> None: + def _commit_user_question_if_needed() -> None: nonlocal user_speech_commited if ( @@ -535,7 +511,7 @@ def _commit_user_message_if_needed() -> None: # since we try to validate as fast as possible it is possible the agent gets interrupted # really quickly (barely audible), we don't want to mark this question as "answered". if not is_using_tools and ( - play_handle.time_played < MIN_TIME_PLAYED_FOR_COMMIT + play_handle.time_played < self.MIN_TIME_PLAYED_FOR_COMMIT and not join_fut.done() ): return @@ -553,9 +529,9 @@ def _commit_user_message_if_needed() -> None: [join_fut], return_when=asyncio.FIRST_COMPLETED, timeout=1.0 ) - _commit_user_message_if_needed() + _commit_user_question_if_needed() - _commit_user_message_if_needed() + _commit_user_question_if_needed() collected_text = speech_info.synthesis_handle.collected_text interrupted = speech_info.synthesis_handle.interrupted @@ -571,7 +547,7 @@ def _commit_user_message_if_needed() -> None: assert isinstance(speech_info.source, LLMStream) assert ( user_speech_commited - ), "user speech should be committed before using tools" + ), "user speech should have been committed before using tools" # execute functions call_ctx = AssistantCallContext(self, speech_info.source) @@ -603,13 +579,12 @@ def _commit_user_message_if_needed() -> None: chat_ctx = speech_info.source.chat_ctx.copy() chat_ctx.messages.extend(extra_tools_messages) - answer_stream = self._llm.chat(chat_ctx=chat_ctx, fnc_ctx=self._fnc_ctx) - answer_synthesis = self._agent_synthesize( - transcript=_llm_stream_to_str_iterable(answer_stream) + answer_llm_stream = self._llm.chat( + chat_ctx=chat_ctx, fnc_ctx=self._fnc_ctx ) - # make sure users can interrupt the fnc calls answer - # TODO(theomonnom): maybe we should add a new fnc_call_answer field to _SpeechInfo? - self._agent_playing_speech.synthesis_handle = answer_synthesis + answer_synthesis = self._synthesize_agent_speech(answer_llm_stream) + # replace the synthesis handle with the new one to allow interruption + speech_info.synthesis_handle = answer_synthesis play_handle = answer_synthesis.play() await play_handle.join() @@ -629,13 +604,19 @@ def _commit_user_message_if_needed() -> None: logger.debug("VoiceAssistant._play_speech ended") - def _agent_synthesize(self, *, transcript: SpeechSource) -> SynthesisHandle: + def _synthesize_agent_speech( + self, + source: str | LLMStream | AsyncIterable[str], + ) -> SynthesisHandle: assert ( self._agent_output is not None ), "agent output should be initialized when ready" + if isinstance(source, LLMStream): + source = _llm_stream_to_str_iterable(source) + return self._agent_output.synthesize( - transcript=transcript, + transcript=source, transcription=self._opts.transcription.agent_transcription, transcription_speed=self._opts.transcription.agent_transcription_speed, sentence_tokenizer=self._opts.transcription.sentence_tokenizer, @@ -643,6 +624,45 @@ def _agent_synthesize(self, *, transcript: SpeechSource) -> SynthesisHandle: hyphenate_word=self._opts.transcription.hyphenate_word, ) + def _validate_reply_if_possible(self) -> None: + """Check if the new agent speech should be played""" + if ( + self._pending_agent_reply is not None + and not self._pending_agent_reply.synthesis_handle.interrupted + ): + self._add_speech_for_playout(self._pending_agent_reply) + self._pending_agent_reply = None + elif not self._opts.preemptive_synthesis and self._transcribed_text: + # validated=True is going to call _add_speech_for_playout + self._synthesize_agent_reply(validated=True) + + # self._transcribed_text is reset after MIN_TIME_PLAYED_FOR_COMMIT, see self._play_speech + self._transcribed_interim_text = "" + + def _interrupt_if_possible(self) -> None: + """Check whether the current assistant speech should be interrupted""" + if ( + self._playing_speech is None + or not self._playing_speech.allow_interruptions + or self._playing_speech.synthesis_handle.interrupted + ): + return + + if self._opts.int_min_words != 0: + # check the final/interim transcribed text for the minimum word count + # to interrupt the agent speech + interim_words = self._opts.transcription.word_tokenizer.tokenize( + text=self._transcribed_interim_text + ) + if len(interim_words) < self._opts.int_min_words: + return + + self._playing_speech.synthesis_handle.interrupt() + + def _add_speech_for_playout(self, speech: _SpeechInfo) -> None: + self._speech_q.append(speech) + self._speech_q_changed.set() + async def _llm_stream_to_str_iterable(stream: LLMStream) -> AsyncIterable[str]: async for chunk in stream: @@ -653,12 +673,12 @@ async def _llm_stream_to_str_iterable(stream: LLMStream) -> AsyncIterable[str]: yield content -class _DeferredAnswerValidation: - # if the STT gives us punctuation, we can validate faster, we can be more confident - # about the end of the sentence (naive way to increase the default DEFER_DELAY to allow the user - # to say longer sentences without being interrupted by the assistant) +class _DeferredReplyValidation: + """This class is used to try to find the best time to validate the agent reply.""" + + # if the STT gives us punctuation, we can try validate the reply faster. PUNCTUATION = ".!?" - DEFER_DELAY_WITH_PUNCTUATION = 0.15 + DEFER_DELAY_WITH_PUNCTUATION = 0.1 DEFER_DELAY = 0.2 LATE_TRANSCRIPT_TOLERANCE = 5 @@ -676,21 +696,8 @@ def __init__( def validating(self) -> bool: return self._validating_task is not None and not self._validating_task.done() - def _get_defer_delay(self) -> float: - if ( - self._last_final_transcript - and self._last_final_transcript[-1] in self.PUNCTUATION - ): - return self.DEFER_DELAY_WITH_PUNCTUATION - - return self.DEFER_DELAY - - def _reset_states(self) -> None: - self._last_final_transcript = "" - self._last_recv_end_of_speech_time = 0.0 - - def on_new_synthesis(self, user_msg: str) -> None: - self._last_final_transcript = user_msg.strip() # type: ignore + def on_human_final_transcript(self, transcript: str) -> None: + self._last_final_transcript = transcript.strip() # type: ignore if self.validating: self._run(self._get_defer_delay()) # debounce @@ -719,16 +726,27 @@ async def aclose(self) -> None: await self._tasks_set.aclose() - @utils.log_exceptions(logger=logger) - async def _run_task(self, delay: float) -> None: - await asyncio.sleep(delay) + def _get_defer_delay(self) -> float: + if ( + self._last_final_transcript + and self._last_final_transcript[-1] in self.PUNCTUATION + ): + return self.DEFER_DELAY_WITH_PUNCTUATION + + return self.DEFER_DELAY + + def _reset_states(self) -> None: self._last_final_transcript = "" - self._received_end_of_speech = False - self._validate_fnc() - logger.debug("_DeferredAnswerValidation speech validated") + self._last_recv_end_of_speech_time = 0.0 def _run(self, delay: float) -> None: + @utils.log_exceptions(logger=logger) + async def _run_task(delay: float) -> None: + await asyncio.sleep(delay) + self._reset_states() + self._validate_fnc() + if self._validating_task is not None: self._validating_task.cancel() - self._validating = self._tasks_set.create_task(self._run_task(delay)) + self._validating = self._tasks_set.create_task(_run_task(delay)) From 5fd1b51e8e49903ef5b9b905d70762ee0f532ce9 Mon Sep 17 00:00:00 2001 From: Neil Dwyer Date: Tue, 30 Jul 2024 15:33:59 -0700 Subject: [PATCH 02/24] Remove deployed dir (#551) --- examples/_deployed/README.md | 1 - examples/_deployed/kitt/Dockerfile | 47 -------- examples/_deployed/kitt/kitt.py | 145 ----------------------- examples/_deployed/kitt/requirements.txt | 6 - 4 files changed, 199 deletions(-) delete mode 100644 examples/_deployed/README.md delete mode 100644 examples/_deployed/kitt/Dockerfile delete mode 100644 examples/_deployed/kitt/kitt.py delete mode 100644 examples/_deployed/kitt/requirements.txt diff --git a/examples/_deployed/README.md b/examples/_deployed/README.md deleted file mode 100644 index afc3e44ae..000000000 --- a/examples/_deployed/README.md +++ /dev/null @@ -1 +0,0 @@ -# The source code of our deployed examples diff --git a/examples/_deployed/kitt/Dockerfile b/examples/_deployed/kitt/Dockerfile deleted file mode 100644 index 099bd1cca..000000000 --- a/examples/_deployed/kitt/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# syntax=docker/dockerfile:1 - -ARG PYTHON_VERSION=3.11.6 -FROM python:${PYTHON_VERSION}-slim - -# Prevents Python from writing pyc files. -ENV PYTHONDONTWRITEBYTECODE=1 - -# Keeps Python from buffering stdout and stderr to avoid situations where -# the application crashes without emitting any logs due to buffering. -ENV PYTHONUNBUFFERED=1 - -# Create a non-privileged user that the app will run under. -# See https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#user -ARG UID=10001 -RUN adduser \ - --disabled-password \ - --gecos "" \ - --home "/home/appuser" \ - --shell "/sbin/nologin" \ - --uid "${UID}" \ - appuser - - -# Install gcc and other build dependencies. -RUN apt-get update && \ - apt-get install -y \ - gcc \ - python3-dev \ - && rm -rf /var/lib/apt/lists/* - -USER appuser - -RUN mkdir -p /home/appuser/.cache -RUN chown -R appuser /home/appuser/.cache - -WORKDIR /home/appuser - -COPY requirements.txt . -RUN python -m pip install --user --no-cache-dir -r requirements.txt - -COPY . . - -RUN python kitt.py download-files - -# Run the application. -CMD python kitt.py start \ No newline at end of file diff --git a/examples/_deployed/kitt/kitt.py b/examples/_deployed/kitt/kitt.py deleted file mode 100644 index 0218241bd..000000000 --- a/examples/_deployed/kitt/kitt.py +++ /dev/null @@ -1,145 +0,0 @@ -import asyncio -import copy -import logging -from collections import deque -from typing import Annotated, List - -from livekit import agents, rtc -from livekit.agents import JobContext, JobRequest, WorkerOptions, cli -from livekit.agents.llm import ChatContext, ChatMessage, ChatRole -from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant -from livekit.plugins import deepgram, elevenlabs, openai, silero - -MAX_IMAGES = 3 -NO_IMAGE_MESSAGE_GENERIC = ( - "I'm sorry, I don't have an image to process. Are you publishing your video?" -) - - -class AssistantFnc(agents.llm.FunctionContext): - @agents.llm.ai_callable( - desc="Called when asked to evaluate something that would require vision capabilities." - ) - async def image( - self, - user_msg: Annotated[ - str, - agents.llm.TypeInfo(desc="The user message that triggered this function"), - ], - ): - ctx = AssistantContext.get_current() - ctx.store_metadata("user_msg", user_msg) - - -async def get_human_video_track(room: rtc.Room): - track_future = asyncio.Future[rtc.RemoteVideoTrack]() - - def on_sub(track: rtc.Track, *_): - if isinstance(track, rtc.RemoteVideoTrack): - track_future.set_result(track) - - room.on("track_subscribed", on_sub) - - remote_video_tracks: List[rtc.RemoteVideoTrack] = [] - for _, p in room.participants.items(): - for _, t_pub in p.tracks.items(): - if t_pub.track is not None and isinstance( - t_pub.track, rtc.RemoteVideoTrack - ): - remote_video_tracks.append(t_pub.track) - - if len(remote_video_tracks) > 0: - track_future.set_result(remote_video_tracks[0]) - - video_track = await track_future - room.off("track_subscribed", on_sub) - return video_track - - -async def entrypoint(ctx: JobContext): - sip = ctx.room.name.startswith("sip") - initial_ctx = ChatContext( - messages=[ - ChatMessage( - role=ChatRole.SYSTEM, - text=( - "You are a funny bot created by LiveKit. Your interface with users will be voice. " - "You should use short and concise responses, and avoiding usage of unpronouncable punctuation." - ), - ) - ] - ) - - gpt = openai.LLM(model="gpt-4o") - latest_image: rtc.VideoFrame | None = None - img_msg_queue: deque[agents.llm.ChatMessage] = deque() - assistant = VoiceAssistant( - vad=silero.VAD(), - stt=deepgram.STT(), - llm=gpt, - tts=elevenlabs.TTS(encoding="pcm_44100"), - fnc_ctx=None if sip else AssistantFnc(), - chat_ctx=initial_ctx, - ) - - chat = rtc.ChatManager(ctx.room) - - async def _answer_from_text(text: str): - chat_ctx = copy.deepcopy(assistant.chat_context) - chat_ctx.messages.append(ChatMessage(role=ChatRole.USER, text=text)) - - stream = await gpt.chat(chat_ctx) - await assistant.say(stream) - - @chat.on("message_received") - def on_chat_received(msg: rtc.ChatMessage): - if not msg.message: - return - - asyncio.create_task(_answer_from_text(msg.message)) - - async def respond_to_image(user_msg: str): - nonlocal latest_image, img_msg_queue, initial_ctx - if not latest_image: - await assistant.say(NO_IMAGE_MESSAGE_GENERIC) - return - - initial_ctx.messages.append( - agents.llm.ChatMessage( - role=agents.llm.ChatRole.USER, - text=user_msg, - images=[agents.llm.ChatImage(image=latest_image)], - ) - ) - img_msg_queue.append(initial_ctx.messages[-1]) - if len(img_msg_queue) >= MAX_IMAGES: - msg = img_msg_queue.popleft() - msg.images = [] - - stream = await gpt.chat(initial_ctx) - await assistant.say(stream, allow_interruptions=True) - - @assistant.on("function_calls_finished") - def _function_calls_done(ctx: AssistantContext): - user_msg = ctx.get_metadata("user_msg") - if not user_msg: - return - asyncio.ensure_future(respond_to_image(user_msg)) - - assistant.start(ctx.room) - - await asyncio.sleep(0.5) - await assistant.say("Hey, how can I help you today?", allow_interruptions=True) - while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED: - video_track = await get_human_video_track(ctx.room) - async for event in rtc.VideoStream(video_track): - latest_image = event.frame - - -async def request_fnc(req: JobRequest) -> None: - logging.info("received request %s", req) - await req.accept(entrypoint) - - -if __name__ == "__main__": - cli.run_app(WorkerOptions(request_fnc)) diff --git a/examples/_deployed/kitt/requirements.txt b/examples/_deployed/kitt/requirements.txt deleted file mode 100644 index 5bfe42247..000000000 --- a/examples/_deployed/kitt/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -livekit-agents>=0.8.2 -livekit-plugins-openai>=0.7.1 -livekit-plugins-deepgram>=0.6.1 -livekit-plugins-elevenlabs>=0.7.1 -livekit-plugins-silero>=0.6.1 - From 0c7addbe0cc90ae03f98554db4a7afe18329ed6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 02:00:55 +0200 Subject: [PATCH 03/24] deepgram: segment audio frames into 200ms intervals before sending to the websocket (#549) --- .../livekit/plugins/deepgram/stt.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 9144ca90b..e71ac09b7 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -25,7 +25,6 @@ from urllib.parse import urlencode import aiohttp -from livekit import rtc from livekit.agents import stt, utils from livekit.agents.utils import AudioBuffer, merge_frames @@ -256,23 +255,27 @@ async def keepalive_task(): async def send_task(): nonlocal closing_ws - # forward inputs to deepgram - # if we receive a close message, signal it to deepgram and break. - # the recv task will then make sure to process the remaining audio and stop + + # forward audio to deepgram in chunks of 200ms + ms200 = self._opts.sample_rate // 5 * self._opts.num_channels + audio_bstream = utils.audio.AudioByteStream( + sample_rate=self._opts.sample_rate, + num_channels=self._opts.num_channels, + samples_per_frame=ms200, + ) + async for data in self._input_ch: - if isinstance(data, rtc.AudioFrame): - # TODO(theomonnom): The remix_and_resample method is low quality - # and should be replaced with a continuous resampling - frame = data.remix_and_resample( - self._opts.sample_rate, self._opts.num_channels - ) + if isinstance(data, self._FlushSentinel): + frames = audio_bstream.flush() + else: + frames = audio_bstream.write(data.data) + for frame in frames: await ws.send_bytes(frame.data.tobytes()) + # tell deepgram we are done sending audio/inputs closing_ws = True - await ws.send_str( - SpeechStream._CLOSE_MSG - ) # tell deepgram we are done with inputs + await ws.send_str(SpeechStream._CLOSE_MSG) async def recv_task(): nonlocal closing_ws From fdd6b3a36ab43cc6fd9b11087d7ad69c9123dcee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 14:22:05 +0200 Subject: [PATCH 04/24] silero: optimize numpy input buffers (#550) --- .changeset/hot-cows-wash.md | 5 + livekit-plugins/install_plugins_editable.sh | 2 +- .../livekit/plugins/silero/onnx_model.py | 39 +- .../livekit/plugins/silero/vad.py | 334 +++++++++--------- tests/test_stt.py | 2 +- 5 files changed, 188 insertions(+), 194 deletions(-) create mode 100644 .changeset/hot-cows-wash.md diff --git a/.changeset/hot-cows-wash.md b/.changeset/hot-cows-wash.md new file mode 100644 index 000000000..04ed627c0 --- /dev/null +++ b/.changeset/hot-cows-wash.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: optimize numpy input buffers diff --git a/livekit-plugins/install_plugins_editable.sh b/livekit-plugins/install_plugins_editable.sh index 4af249a26..a2b66100a 100755 --- a/livekit-plugins/install_plugins_editable.sh +++ b/livekit-plugins/install_plugins_editable.sh @@ -15,7 +15,7 @@ pip install -e ./livekit-plugins-deepgram --config-settings editable_mode=strict pip install -e ./livekit-plugins-elevenlabs --config-settings editable_mode=strict pip install -e ./livekit-plugins-google --config-settings editable_mode=strict pip install -e ./livekit-plugins-minimal --config-settings editable_mode=strict -pip install -e ./livekit-plugins-nklt --config-settings editable_mode=strict +pip install -e ./livekit-plugins-nltk --config-settings editable_mode=strict pip install -e ./livekit-plugins-openai --config-settings editable_mode=strict pip install -e ./livekit-plugins-rag --config-settings editable_mode=strict pip install -e ./livekit-plugins-silero --config-settings editable_mode=strict diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py index 8af75bd26..b0f24e564 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py @@ -18,7 +18,7 @@ def new_inference_session(force_cpu: bool) -> onnxruntime.InferenceSession: / "silero_vad.onnx" ) ctx = importlib.resources.as_file(res) - path = _resource_files.enter_context(ctx) + path = str(_resource_files.enter_context(ctx)) opts = onnxruntime.SessionOptions() opts.inter_op_num_threads = 1 @@ -27,10 +27,10 @@ def new_inference_session(force_cpu: bool) -> onnxruntime.InferenceSession: if force_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers(): session = onnxruntime.InferenceSession( - str(path), providers=["CPUExecutionProvider"], ess_options=opts + path, providers=["CPUExecutionProvider"], ess_options=opts ) else: - session = onnxruntime.InferenceSession(str(path), sess_options=opts) + session = onnxruntime.InferenceSession(path, sess_options=opts) return session @@ -52,7 +52,16 @@ def __init__( self._window_size_samples = 512 self._context_size = 64 - self.reset_states() + self._sample_rate_nd = np.array(sample_rate, dtype=np.int64) + self._context = np.zeros((1, self._context_size), dtype=np.float32) + self._rnn_state = np.zeros((2, 1, 128), dtype=np.float32) + self._input_buffer = np.zeros( + (1, self._context_size + self._window_size_samples), dtype=np.float32 + ) + + @property + def sample_rate(self) -> int: + return self._sample_rate @property def window_size_samples(self) -> int: @@ -62,25 +71,15 @@ def window_size_samples(self) -> int: def context_size(self) -> int: return self._context_size - def reset_states(self) -> None: - self._state = np.zeros((2, 1, 128), dtype=np.float32) - self._context = np.zeros((1, self._context_size), dtype=np.float32) - def __call__(self, x: np.ndarray) -> float: - if x.ndim == 1: - x = np.expand_dims(x, axis=0) - - if x.shape[1] != self._window_size_samples: - raise ValueError( - f"Input shape must be (N, {self._window_size_samples}), got {x.shape}" - ) + self._input_buffer[:, : self._context_size] = self._context + self._input_buffer[:, self._context_size :] = x - x = np.concatenate([self._context, x], axis=1) ort_inputs = { - "input": x, - "state": self._state, - "sr": np.array(self._sample_rate, dtype=np.int64), + "input": self._input_buffer, + "state": self._rnn_state, + "sr": self._sample_rate_nd, } out, self._state = self._sess.run(None, ort_inputs) - self._context = x[..., -self._context_size :] + self._context = self._input_buffer[:, -self._context_size :] return out.item() diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 283ad4926..7673a9828 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -16,12 +16,12 @@ import asyncio import time +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass import numpy as np import onnxruntime # type: ignore from livekit import agents, rtc -from livekit.agents import utils from . import onnx_model from .log import logger @@ -96,206 +96,196 @@ def stream(self) -> "VADStream": ) -@dataclass -class _WindowData: - inference_data: np.ndarray - # data returned to the user are the original frames (int16) - original_data: np.ndarray - - class VADStream(agents.vad.VADStream): def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None: super().__init__() self._opts, self._model = opts, model - self._original_sample_rate: int | None = None - self._window_data: _WindowData | None = None - self._remaining_samples = model.window_size_samples + self._loop = asyncio.get_event_loop() + + self._executor = ThreadPoolExecutor(max_workers=1) + self._task.add_done_callback(lambda _: self._executor.shutdown(wait=False)) @agents.utils.log_exceptions(logger=logger) async def _main_task(self): - window_ch = utils.aio.Chan[_WindowData]() - await asyncio.gather( - self._run_inference(window_ch), self._forward_input(window_ch) + og_sample_rate = 0 + og_needed_samples = 0 # needed samples to complete the window data + og_window_size_samples = 0 # size in samples of og_window_data + og_window_data: np.ndarray | None = None + + index_step = 0 + inference_window_data = np.empty( + self._model.window_size_samples, dtype=np.float32 ) - async def _forward_input(self, window_tx: utils.aio.ChanSender[_WindowData]): - """ - Push frame to the VAD stream for processing. - The frames are split into chunks of the given window size and processed. - (Buffered if the window size is not reached yet) - """ + # a copy is exposed to the user in END_OF_SPEECH + speech_buffer: np.ndarray | None = None + speech_buffer_index: int = 0 + + # "pub_" means public, these values are exposed to the users through events + pub_speaking = False + pub_speech_duration = 0.0 + pub_silence_duration = 0.0 + pub_current_sample = 0 + + speech_threshold_duration = 0.0 + silence_threshold_duration = 0.0 + async for frame in self._input_ch: if not isinstance(frame, rtc.AudioFrame): - continue + continue # ignore flush sentinel for now if frame.sample_rate != 8000 and frame.sample_rate % 16000 != 0: logger.error("only 8KHz and 16KHz*X sample rates are supported") continue - - if ( - self._original_sample_rate is not None - and self._original_sample_rate != frame.sample_rate - ): - raise ValueError("a frame with another sample rate was already pushed") - - self._original_sample_rate = frame.sample_rate - step = frame.sample_rate // 16000 - - if self._window_data is None: - self._window_data = _WindowData( - inference_data=np.zeros( - self._model.window_size_samples, dtype=np.float32 - ), - original_data=np.zeros( - self._model.window_size_samples * step, dtype=np.int16 - ), + elif og_window_data is None: + # alloc the og buffers now that we know the pushed sample rate + og_sample_rate = frame.sample_rate + og_window_size_samples = int( + (self._model.window_size_samples / self._model.sample_rate) + * og_sample_rate ) + og_window_data = np.empty(og_window_size_samples, dtype=np.int16) + og_needed_samples = og_window_size_samples + index_step = frame.sample_rate // 16000 - if frame.num_channels != 1: - raise ValueError("vad currently only supports mono audio frames") - - og_frame = np.frombuffer(frame.data, dtype=np.int16) - if_frame = og_frame[::step].astype(np.float32) / np.iinfo(np.int16).max - - remaining_data = len(if_frame) - while remaining_data > 0: - i = self._model.window_size_samples - self._remaining_samples - to_copy = min(remaining_data, self._remaining_samples) - self._remaining_samples -= to_copy - remaining_data -= to_copy - - self._window_data.original_data[ - i * step : i * step + to_copy * step - ] = og_frame[: to_copy * step] - self._window_data.inference_data[i : i + to_copy] = if_frame[:to_copy] - - if self._remaining_samples == 0: - window_tx.send_nowait(self._window_data) - self._window_data = _WindowData( - inference_data=np.zeros( - self._model.window_size_samples, dtype=np.float32 - ), - original_data=np.zeros( - self._model.window_size_samples * step, dtype=np.int16 - ), - ) - self._remaining_samples = self._model.window_size_samples - - window_tx.close() + speech_buffer = np.empty( + int(self._opts.max_buffered_speech * og_sample_rate), dtype=np.int16 + ) + elif og_sample_rate != frame.sample_rate: + logger.error("a frame with another sample rate was already pushed") + continue - async def _run_inference(self, window_rx: utils.aio.ChanReceiver[_WindowData]): - pub_speaking = False - pub_speech_duration = 0.0 - pub_silence_duration = 0.0 - pub_speech_buf = np.array([], dtype=np.int16) + frame_data = np.frombuffer(frame.data, dtype=np.int16) + remaining_samples = len(frame_data) + while remaining_samples > 0: + to_copy = min(remaining_samples, og_needed_samples) - may_start_at_sample = -1 - may_end_at_sample = -1 + index = len(og_window_data) - og_needed_samples + og_window_data[index : index + to_copy] = frame_data[:to_copy] - min_speech_samples = int( - self._opts.min_speech_duration * self._opts.sample_rate - ) - min_silence_samples = int( - self._opts.min_silence_duration * self._opts.sample_rate - ) + remaining_samples -= to_copy + og_needed_samples -= to_copy - current_sample = 0 + if og_needed_samples != 0: + continue - async for window_data in window_rx: - inference_data = window_data.inference_data - start_time = time.time() - raw_prob = await asyncio.to_thread(lambda: self._model(inference_data)) - inference_duration = time.time() - start_time + og_needed_samples = og_window_size_samples - window_duration = self._model.window_size_samples / self._opts.sample_rate - if inference_duration > window_duration: - # slower than realtime - logger.warning( - "vad inference took too long - slower than realtime: %f", - inference_duration, + # copy the data to the inference buffer by sampling at each index_step & convert to float + np.divide( + og_window_data[::index_step], + np.iinfo(np.int16).max, + out=inference_window_data, + dtype=np.float32, ) - # append new data to current speech buffer - pub_speech_buf = np.append(pub_speech_buf, window_data.original_data) - max_data_s = self._opts.padding_duration - if not pub_speaking: - max_data_s += self._opts.min_speech_duration - else: - max_data_s += self._opts.max_buffered_speech - - assert self._original_sample_rate is not None - cl = int(max_data_s) * self._original_sample_rate - if len(pub_speech_buf) > cl: - pub_speech_buf = pub_speech_buf[-cl:] - - # dispatch start/end when needed - if raw_prob >= self._opts.activation_threshold: - may_end_at_sample = -1 - - if may_start_at_sample == -1: - may_start_at_sample = current_sample + min_speech_samples - - if may_start_at_sample <= current_sample and not pub_speaking: - pub_speaking = True - self._event_ch.send_nowait( - agents.vad.VADEvent( - type=agents.vad.VADEventType.START_OF_SPEECH, - silence_duration=pub_silence_duration, - speech_duration=0.0, - samples_index=current_sample, - speaking=True, - ) - ) - - pub_silence_duration = 0 - pub_speech_duration += self._opts.min_speech_duration - - if pub_speaking: - pub_speech_duration += window_duration - else: - pub_silence_duration = 0 - - self._event_ch.send_nowait( - agents.vad.VADEvent( - type=agents.vad.VADEventType.INFERENCE_DONE, - samples_index=current_sample, - silence_duration=0.0, - speech_duration=pub_speech_duration, - probability=raw_prob, - inference_duration=inference_duration, - speaking=pub_speaking, + # run the inference + start_time = time.time() + raw_prob = await self._loop.run_in_executor( + self._executor, self._model, inference_window_data ) - ) - - if raw_prob < self._opts.activation_threshold: - may_start_at_sample = -1 - - if may_end_at_sample == -1: - may_end_at_sample = current_sample + min_silence_samples + inference_duration = time.time() - start_time + window_duration = ( + self._model.window_size_samples / self._opts.sample_rate + ) + if inference_duration > window_duration: + logger.warning( + "vad inference took too long - slower than realtime: %f", + inference_duration, + ) - if may_end_at_sample <= current_sample and pub_speaking: - pub_speaking = False + pub_current_sample += og_window_size_samples - frame = rtc.AudioFrame( - sample_rate=self._original_sample_rate, - num_channels=1, - samples_per_channel=len(pub_speech_buf), - data=pub_speech_buf.tobytes(), + def _copy_window(): + nonlocal speech_buffer_index + to_copy = min( + og_window_size_samples, + len(speech_buffer) - speech_buffer_index, ) - - self._event_ch.send_nowait( - agents.vad.VADEvent( - type=agents.vad.VADEventType.END_OF_SPEECH, - samples_index=current_sample, - silence_duration=0.0, - speech_duration=pub_speech_duration, - frames=[frame], - speaking=False, - ) + if to_copy <= 0: + # max_buffered_speech reached + return + + speech_buffer[ + speech_buffer_index : speech_buffer_index + to_copy + ] = og_window_data + speech_buffer_index += og_window_size_samples + + if pub_speaking: + pub_speech_duration += window_duration + _copy_window() + else: + pub_silence_duration += window_duration + + self._event_ch.send_nowait( + agents.vad.VADEvent( + type=agents.vad.VADEventType.INFERENCE_DONE, + samples_index=pub_current_sample, + silence_duration=pub_silence_duration, + speech_duration=pub_speech_duration, + probability=raw_prob, + inference_duration=inference_duration, + speaking=pub_speaking, ) + ) - pub_speech_buf = np.array([], dtype=np.int16) - pub_speech_duration = 0 - pub_silence_duration += self._opts.min_silence_duration + if raw_prob >= self._opts.activation_threshold: + speech_threshold_duration += window_duration + silence_threshold_duration = 0.0 + + if not pub_speaking: + _copy_window() + + if speech_threshold_duration >= self._opts.min_speech_duration: + pub_speaking = True + pub_silence_duration = 0.0 + pub_speech_duration = speech_threshold_duration + + self._event_ch.send_nowait( + agents.vad.VADEvent( + type=agents.vad.VADEventType.START_OF_SPEECH, + samples_index=pub_current_sample, + silence_duration=pub_silence_duration, + speech_duration=pub_speech_duration, + speaking=True, + ) + ) + else: + silence_threshold_duration += window_duration + speech_threshold_duration = 0.0 + + if not pub_speaking: + speech_buffer_index = 0 + + if ( + pub_speaking + and silence_threshold_duration + >= self._opts.min_silence_duration + ): + pub_speaking = False + pub_speech_duration = 0.0 + pub_silence_duration = silence_threshold_duration + + speech_data = speech_buffer[ + :speech_buffer_index + ].tobytes() # copy the data from speech_buffer + + self._event_ch.send_nowait( + agents.vad.VADEvent( + type=agents.vad.VADEventType.END_OF_SPEECH, + samples_index=pub_current_sample, + silence_duration=pub_silence_duration, + speech_duration=pub_speech_duration, + frames=[ + rtc.AudioFrame( + sample_rate=og_sample_rate, + num_channels=1, + samples_per_channel=speech_buffer_index, + data=speech_data, + ) + ], + speaking=False, + ) + ) - current_sample += self._model.window_size_samples + speech_buffer_index = 0 diff --git a/tests/test_stt.py b/tests/test_stt.py index d3b55b3ea..3772a1976 100644 --- a/tests/test_stt.py +++ b/tests/test_stt.py @@ -53,7 +53,7 @@ async def test_recognize(stt: agents.stt.STT): assert event.type == agents.stt.SpeechEventType.FINAL_TRANSCRIPT -STREAM_VAD = silero.VAD.load() +STREAM_VAD = silero.VAD.load(min_silence_duration=0.75) STREAM_STT = [ deepgram.STT(), google.STT(), From 325b06920314b367ecc3df26c7564076f17147b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 14:24:32 +0200 Subject: [PATCH 05/24] voiceassistant: interrupt on final transcript (#546) --- .changeset/six-badgers-sneeze.md | 5 +++++ .../livekit/agents/voice_assistant/voice_assistant.py | 8 ++++++++ 2 files changed, 13 insertions(+) create mode 100644 .changeset/six-badgers-sneeze.md diff --git a/.changeset/six-badgers-sneeze.md b/.changeset/six-badgers-sneeze.md new file mode 100644 index 000000000..331bc63fe --- /dev/null +++ b/.changeset/six-badgers-sneeze.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: interrupt on final transcript diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index 6f4e9fdb0..bea2fcf58 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -381,6 +381,14 @@ def _on_final_transcript(ev: stt.SpeechEvent) -> None: self._deferred_validation.on_human_final_transcript(ev.alternatives[0].text) + words = self._opts.transcription.word_tokenizer.tokenize( + text=ev.alternatives[0].text + ) + if len(words) >= 3: + # VAD can sometimes not detect that the human is speaking + # to make the interruption more reliable, we also interrupt on the final transcript. + self._interrupt_if_possible() + self._human_input.on("start_of_speech", _on_start_of_speech) self._human_input.on("vad_inference_done", _on_vad_updated) self._human_input.on("end_of_speech", _on_end_of_speech) From ddfbb3582e08590ba5d0ff1a547dda6a7eb1d002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 14:27:35 +0200 Subject: [PATCH 06/24] voiceassistant: commit user question directly when allow_interruptions=False (#547) --- .changeset/pretty-rivers-care.md | 5 +++++ .../agents/voice_assistant/voice_assistant.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 .changeset/pretty-rivers-care.md diff --git a/.changeset/pretty-rivers-care.md b/.changeset/pretty-rivers-care.md new file mode 100644 index 000000000..1c7cf200b --- /dev/null +++ b/.changeset/pretty-rivers-care.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: commit user question directly when allow_interruptions=False diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index bea2fcf58..93b87cd66 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -499,7 +499,6 @@ async def _play_speech(self, speech_info: _SpeechInfo) -> None: user_speech_commited = False play_handle = synthesis_handle.play() - join_fut = play_handle.join() def _commit_user_question_if_needed() -> None: nonlocal user_speech_commited @@ -518,9 +517,13 @@ def _commit_user_question_if_needed() -> None: # make sure at least some speech was played before committing the user message # since we try to validate as fast as possible it is possible the agent gets interrupted # really quickly (barely audible), we don't want to mark this question as "answered". - if not is_using_tools and ( - play_handle.time_played < self.MIN_TIME_PLAYED_FOR_COMMIT - and not join_fut.done() + if ( + speech_info.allow_interruptions + and not is_using_tools + and ( + play_handle.time_played < self.MIN_TIME_PLAYED_FOR_COMMIT + and not join_fut.done() + ) ): return @@ -532,6 +535,9 @@ def _commit_user_question_if_needed() -> None: user_speech_commited = True # wait for the play_handle to finish and check every 1s if the user question should be committed + _commit_user_question_if_needed() + + join_fut = play_handle.join() while not join_fut.done(): await asyncio.wait( [join_fut], return_when=asyncio.FIRST_COMPLETED, timeout=1.0 From 86f76d6e96bf56ae652aadb9f851c746f73653c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 14:29:02 +0200 Subject: [PATCH 07/24] voiceassistant: fix duplicate answers (#548) --- .changeset/wicked-rats-exist.md | 5 +++++ .../livekit/agents/voice_assistant/voice_assistant.py | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 .changeset/wicked-rats-exist.md diff --git a/.changeset/wicked-rats-exist.md b/.changeset/wicked-rats-exist.md new file mode 100644 index 000000000..bf9c6fe7c --- /dev/null +++ b/.changeset/wicked-rats-exist.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: fix duplicate answers diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index 93b87cd66..5e9958bd8 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -644,6 +644,12 @@ def _validate_reply_if_possible(self) -> None: self._pending_agent_reply is not None and not self._pending_agent_reply.synthesis_handle.interrupted ): + # in some timing, we could end up with two pushed agent replies inside the speech queue. + # so make sure we directly interrupt every reply when pushing a new one + for speech in self._speech_q: + if speech.allow_interruptions and speech.is_reply: + speech.synthesis_handle.interrupt() + self._add_speech_for_playout(self._pending_agent_reply) self._pending_agent_reply = None elif not self._opts.preemptive_synthesis and self._transcribed_text: From ab466d587144e27c791459eddab7b90f14da3b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 14:51:32 +0200 Subject: [PATCH 08/24] forgot to add changesets (#553) Co-authored-by: lukasIO --- .changeset/many-clocks-punch.md | 5 +++++ .changeset/perfect-doors-play.md | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/many-clocks-punch.md create mode 100644 .changeset/perfect-doors-play.md diff --git a/.changeset/many-clocks-punch.md b/.changeset/many-clocks-punch.md new file mode 100644 index 000000000..fa3125796 --- /dev/null +++ b/.changeset/many-clocks-punch.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: cleanup validation behaviour #545 diff --git a/.changeset/perfect-doors-play.md b/.changeset/perfect-doors-play.md new file mode 100644 index 000000000..9479814e0 --- /dev/null +++ b/.changeset/perfect-doors-play.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-deepgram": patch +--- + +deepgram: segment audio frames into 200ms intervals before sending to the websocket #549 From 156f9a81979ff44b15506c012db5ca2eb14ea122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 22:53:33 +0200 Subject: [PATCH 09/24] voiceassistant: run function calls sequentially (#554) --- .changeset/angry-books-eat.md | 5 +++++ livekit-agents/livekit/agents/llm/llm.py | 8 +++----- .../agents/voice_assistant/voice_assistant.py | 14 +++++++++++--- 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 .changeset/angry-books-eat.md diff --git a/.changeset/angry-books-eat.md b/.changeset/angry-books-eat.md new file mode 100644 index 000000000..8c09f6c2e --- /dev/null +++ b/.changeset/angry-books-eat.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: run function calls sequentially diff --git a/livekit-agents/livekit/agents/llm/llm.py b/livekit-agents/livekit/agents/llm/llm.py index df85b1248..f947ecd22 100644 --- a/livekit-agents/livekit/agents/llm/llm.py +++ b/livekit-agents/livekit/agents/llm/llm.py @@ -5,6 +5,7 @@ from dataclasses import dataclass, field from typing import Any, AsyncIterator +from .. import utils from . import function_context from .chat_context import ChatContext, ChatRole @@ -65,7 +66,7 @@ def fnc_ctx(self) -> function_context.FunctionContext | None: return self._fnc_ctx def execute_functions(self) -> list[function_context.CalledFunction]: - """Run all functions in this stream.""" + """Execute all functions concurrently of this stream.""" called_functions: list[function_context.CalledFunction] = [] for fnc_info in self._function_calls_info: called_fnc = fnc_info.execute() @@ -76,10 +77,7 @@ def execute_functions(self) -> list[function_context.CalledFunction]: return called_functions async def aclose(self) -> None: - for task in self._tasks: - task.cancel() - - await asyncio.gather(*self._tasks, return_exceptions=True) + await utils.aio.gracefully_cancel(*self._tasks) def __aiter__(self) -> AsyncIterator[ChatChunk]: return self diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index 5e9958bd8..f342a88ce 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -567,9 +567,17 @@ def _commit_user_question_if_needed() -> None: call_ctx = AssistantCallContext(self, speech_info.source) tk = _CallContextVar.set(call_ctx) self.emit("function_calls_collected", speech_info.source.function_calls) - called_fncs = speech_info.source.execute_functions() - tasks = [called_fnc.task for called_fnc in called_fncs] - await asyncio.gather(*tasks, return_exceptions=True) + called_fncs_info = speech_info.source.function_calls + + called_fncs = [] + for fnc in called_fncs_info: + called_fnc = fnc.execute() + called_fncs.append(called_fnc) + try: + await called_fnc.task + except Exception: + pass + self.emit("function_calls_finished", called_fncs) _CallContextVar.reset(tk) From ad3c05ab3c7d158ac499885d4e385cc72a5afa54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 22:53:44 +0200 Subject: [PATCH 10/24] ipc: increase high ping threshold (#556) --- .changeset/purple-garlics-approve.md | 5 +++++ livekit-agents/livekit/agents/ipc/proto.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 .changeset/purple-garlics-approve.md diff --git a/.changeset/purple-garlics-approve.md b/.changeset/purple-garlics-approve.md new file mode 100644 index 000000000..a8aaaca09 --- /dev/null +++ b/.changeset/purple-garlics-approve.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +ipc: increase high ping threshold diff --git a/livekit-agents/livekit/agents/ipc/proto.py b/livekit-agents/livekit/agents/ipc/proto.py index f1133698c..4a3a7a759 100644 --- a/livekit-agents/livekit/agents/ipc/proto.py +++ b/livekit-agents/livekit/agents/ipc/proto.py @@ -12,7 +12,7 @@ PING_INTERVAL = 5 PING_TIMEOUT = 90 -HIGH_PING_THRESHOLD = 0.02 # 20ms +HIGH_PING_THRESHOLD = 0.15 # 150ms @dataclass From 833115d0a18c3febbb7959bf45e439b42120d84f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 22:55:21 +0200 Subject: [PATCH 11/24] configure plugins loggers & more debug logs on the voiceassistant (#555) --- .changeset/eleven-kings-push.md | 5 ++ livekit-agents/livekit/agents/cli/log.py | 25 +++++++- livekit-agents/livekit/agents/plugin.py | 39 +++++++++--- .../agents/voice_assistant/agent_output.py | 58 +++++++++++++----- ...cancellable_source.py => agent_playout.py} | 26 ++++++-- .../agents/voice_assistant/voice_assistant.py | 61 +++++++++++++++---- .../livekit/plugins/azure/__init__.py | 7 +-- .../livekit/plugins/cartesia/__init__.py | 7 +-- .../livekit/plugins/deepgram/__init__.py | 7 +-- .../livekit/plugins/elevenlabs/__init__.py | 7 +-- .../livekit/plugins/google/__init__.py | 7 +-- .../livekit/plugins/minimal/__init__.py | 6 +- .../livekit/plugins/minimal/log.py | 3 + .../livekit/plugins/nltk/__init__.py | 4 +- .../livekit/plugins/openai/__init__.py | 7 +-- .../livekit/plugins/rag/__init__.py | 7 +-- .../livekit/plugins/silero/__init__.py | 7 +-- 17 files changed, 204 insertions(+), 79 deletions(-) create mode 100644 .changeset/eleven-kings-push.md rename livekit-agents/livekit/agents/voice_assistant/{cancellable_source.py => agent_playout.py} (87%) create mode 100644 livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/log.py diff --git a/.changeset/eleven-kings-push.md b/.changeset/eleven-kings-push.md new file mode 100644 index 000000000..332b38e99 --- /dev/null +++ b/.changeset/eleven-kings-push.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +configure plugins loggers & more debug logs on the voiceassistant diff --git a/livekit-agents/livekit/agents/cli/log.py b/livekit-agents/livekit/agents/cli/log.py index 81419bd40..e4fef173a 100644 --- a/livekit-agents/livekit/agents/cli/log.py +++ b/livekit-agents/livekit/agents/cli/log.py @@ -9,7 +9,7 @@ from inspect import istraceback from typing import Any, Dict, Tuple -from ..log import logger +from ..plugin import Plugin # skip default LogRecord attributes # http://docs.python.org/library/logging.html#logrecord-attributes @@ -196,6 +196,25 @@ def setup_logging(log_level: str, production: bool = True) -> None: root = logging.getLogger() root.addHandler(handler) - root.setLevel(logging.WARN) - logger.setLevel(log_level) + if root.level == logging.NOTSET: + root.setLevel(logging.WARN) + + from ..log import logger + + if logger.level == logging.NOTSET: + logger.setLevel(log_level) + + from ..voice_assistant.log import logger + + if logger.level == logging.NOTSET: + logger.setLevel(log_level) + + def _configure_plugin_logger(plugin: Plugin) -> None: + if plugin.logger is not None and plugin.logger.level == logging.NOTSET: + plugin.logger.setLevel(log_level) + + for plugin in Plugin.registered_plugins: + _configure_plugin_logger(plugin) + + Plugin.emitter.on("plugin_registered", _configure_plugin_logger) diff --git a/livekit-agents/livekit/agents/plugin.py b/livekit-agents/livekit/agents/plugin.py index 2819d1e5e..3554fc337 100644 --- a/livekit-agents/livekit/agents/plugin.py +++ b/livekit-agents/livekit/agents/plugin.py @@ -1,22 +1,43 @@ -from abc import ABC, abstractmethod -from typing import List +from __future__ import annotations + +import logging +import threading +from abc import ABC +from typing import List, Literal + +from . import utils + +EventTypes = Literal["plugin_registered",] class Plugin(ABC): registered_plugins: List["Plugin"] = [] - - def __init__(self, title: str, version: str, package: str) -> None: + emitter: utils.EventEmitter[EventTypes] = utils.EventEmitter() + lock = threading.Lock() + + # TODO(theomonnom): make logger mandatory once all plugins have been updated + def __init__( + self, + title: str, + version: str, + package: str, + logger: logging.Logger | None = None, + ) -> None: self._title = title self._version = version self._package = package + self._logger = logger @classmethod def register_plugin(cls, plugin: "Plugin") -> None: + if threading.current_thread() != threading.main_thread(): + raise RuntimeError("Plugins must be registered on the main thread") + cls.registered_plugins.append(plugin) + cls.emitter.emit("plugin_registered", plugin) - @abstractmethod - def download_files(self) -> None: - pass + # plugin can implement an optional download_files method + def download_files(self) -> None: ... @property def package(self) -> str: @@ -29,3 +50,7 @@ def title(self) -> str: @property def version(self) -> str: return self._version + + @property + def logger(self) -> logging.Logger | None: + return self._logger diff --git a/livekit-agents/livekit/agents/voice_assistant/agent_output.py b/livekit-agents/livekit/agents/voice_assistant/agent_output.py index 399d0e74f..183641311 100644 --- a/livekit-agents/livekit/agents/voice_assistant/agent_output.py +++ b/livekit-agents/livekit/agents/voice_assistant/agent_output.py @@ -9,7 +9,7 @@ from .. import llm, tokenize, utils from .. import transcription as agent_transcription from .. import tts as text_to_speech -from .cancellable_source import CancellableAudioSource, PlayoutHandle +from .agent_playout import AgentPlayout, PlayoutHandle from .log import logger SpeechSource = Union[AsyncIterable[str], str] @@ -19,14 +19,15 @@ class SynthesisHandle: def __init__( self, *, + speech_id: str, speech_source: SpeechSource, - audio_source: CancellableAudioSource, + agent_playout: AgentPlayout, tts: text_to_speech.TTS, transcription_fwd: agent_transcription.TTSSegmentsForwarder | None = None, ) -> None: - self._speech_source, self._audio_source, self._tts, self._tr_fwd = ( + self._speech_source, self._agent_playout, self._tts, self._tr_fwd = ( speech_source, - audio_source, + agent_playout, tts, transcription_fwd, ) @@ -34,6 +35,11 @@ def __init__( self._play_handle: PlayoutHandle | None = None self._interrupt_fut = asyncio.Future[None]() self._collected_text = "" # collected text from the async stream + self._speech_id = speech_id + + @property + def speech_id(self) -> str: + return self._speech_id @property def validated(self) -> bool: @@ -56,8 +62,8 @@ def play(self) -> PlayoutHandle: if self.interrupted: raise RuntimeError("synthesis was interrupted") - self._play_handle = self._audio_source.play( - self._buf_ch, transcription_fwd=self._tr_fwd + self._play_handle = self._agent_playout.play( + self._speech_id, self._buf_ch, transcription_fwd=self._tr_fwd ) return self._play_handle @@ -66,6 +72,11 @@ def interrupt(self) -> None: if self.interrupted: return + logger.debug( + "SynthesisHandle.interrupt: interrupting synthesis", + extra={"speech_id": self.speech_id}, + ) + if self._play_handle is not None: self._play_handle.interrupt() @@ -77,16 +88,21 @@ def __init__( self, *, room: rtc.Room, - source: CancellableAudioSource, + agent_playout: AgentPlayout, llm: llm.LLM, tts: text_to_speech.TTS, ) -> None: - self._room, self._source, self._llm, self._tts = room, source, llm, tts + self._room, self._agent_playout, self._llm, self._tts = ( + room, + agent_playout, + llm, + tts, + ) self._tasks = set[asyncio.Task[Any]]() @property - def audio_source(self) -> CancellableAudioSource: - return self._source + def playout(self) -> AgentPlayout: + return self._agent_playout async def aclose(self) -> None: for task in self._tasks: @@ -97,6 +113,7 @@ async def aclose(self) -> None: def synthesize( self, *, + speech_id: str, transcript: SpeechSource, transcription: bool, transcription_speed: float, @@ -117,9 +134,10 @@ def synthesize( handle = SynthesisHandle( speech_source=transcript, - audio_source=self._source, + agent_playout=self._agent_playout, tts=self._tts, transcription_fwd=transcription_fwd, + speech_id=speech_id, ) task = asyncio.create_task(self._synthesize_task(handle)) @@ -160,8 +178,13 @@ async def _str_synthesis_task(text: str, handle: SynthesisHandle) -> None: async for audio in handle._tts.synthesize(text): if first_frame: first_frame = False - dt = time.time() - start_time - logger.debug(f"AgentOutput._str_synthesis_task: TTFB in {dt:.2f}s") + logger.debug( + "AgentOutput._str_synthesis_task: received first frame of TTS", + extra={ + "speech_id": handle.speech_id, + "elapsed": time.time() - start_time, + }, + ) frame = audio.frame @@ -187,8 +210,13 @@ async def _read_generated_audio_task(): async for audio in tts_stream: if first_frame: first_frame = False - dt = time.time() - start_time - logger.debug(f"AgentOutput._stream_synthesis_task: TTFB in {dt:.2f}s") + logger.debug( + "AgentOutput._stream_synthesis_task: received first frame of TTS", + extra={ + "speech_id": handle.speech_id, + "elapsed": time.time() - start_time, + }, + ) if handle._tr_fwd and not handle._tr_fwd.closed: handle._tr_fwd.push_audio(audio.frame) diff --git a/livekit-agents/livekit/agents/voice_assistant/cancellable_source.py b/livekit-agents/livekit/agents/voice_assistant/agent_playout.py similarity index 87% rename from livekit-agents/livekit/agents/voice_assistant/cancellable_source.py rename to livekit-agents/livekit/agents/voice_assistant/agent_playout.py index 41665fab2..10aedce74 100644 --- a/livekit-agents/livekit/agents/voice_assistant/cancellable_source.py +++ b/livekit-agents/livekit/agents/voice_assistant/agent_playout.py @@ -14,6 +14,7 @@ class PlayoutHandle: def __init__( self, + speech_id: str, playout_source: AsyncIterable[rtc.AudioFrame], transcription_fwd: transcription.TTSSegmentsForwarder | None = None, ) -> None: @@ -22,6 +23,11 @@ def __init__( self._interrupted = False self._time_played = 0.0 self._done_fut = asyncio.Future[None]() + self._speech_id = speech_id + + @property + def speech_id(self) -> str: + return self._speech_id @property def interrupted(self) -> bool: @@ -44,7 +50,7 @@ def join(self) -> asyncio.Future: return self._done_fut -class CancellableAudioSource(utils.EventEmitter[EventTypes]): +class AgentPlayout(utils.EventEmitter[EventTypes]): def __init__(self, *, source: rtc.AudioSource, alpha: float = 0.95) -> None: super().__init__() self._source = source @@ -76,6 +82,7 @@ async def aclose(self) -> None: def play( self, + speech_id: str, playout_source: AsyncIterable[rtc.AudioFrame], transcription_fwd: transcription.TTSSegmentsForwarder | None = None, ) -> PlayoutHandle: @@ -83,7 +90,9 @@ def play( raise ValueError("cancellable source is closed") handle = PlayoutHandle( - playout_source=playout_source, transcription_fwd=transcription_fwd + speech_id=speech_id, + playout_source=playout_source, + transcription_fwd=transcription_fwd, ) self._playout_atask = asyncio.create_task( self._playout_task(self._playout_atask, handle) @@ -106,13 +115,16 @@ def _should_break(): if old_task is not None: await utils.aio.gracefully_cancel(old_task) - logger.debug("CancellableAudioSource._playout_task: started") - async for frame in handle._playout_source: if first_frame: if handle._tr_fwd is not None: handle._tr_fwd.segment_playout_started() + logger.debug( + "AgentPlayout: first frame started playing", + extra={"speech_id": handle.speech_id}, + ) + self.emit("playout_started") first_frame = False @@ -156,4 +168,8 @@ def _should_break(): handle._done_fut.set_result(None) if handle._tr_fwd is not None: await handle._tr_fwd.aclose() - logger.debug("CancellableAudioSource._playout_task: ended") + + logger.debug( + "AgentPlayout: done playing", + extra={"speech_id": handle.speech_id, "cancelled": cancelled}, + ) diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index f342a88ce..21266e62d 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -11,7 +11,7 @@ from .. import stt, tokenize, tts, utils, vad from ..llm import LLM, ChatContext, ChatMessage, FunctionContext, LLMStream from .agent_output import AgentOutput, SynthesisHandle -from .cancellable_source import CancellableAudioSource +from .agent_playout import AgentPlayout from .human_input import HumanInput from .log import logger from .plotter import AssistantPlotter @@ -19,6 +19,7 @@ @dataclass class _SpeechInfo: + id: str # useful to recognize a specific speech in logs source: str | LLMStream | AsyncIterable[str] allow_interruptions: bool add_to_chat_ctx: bool @@ -303,12 +304,14 @@ async def say( add_to_chat_ctx: Whether to add the speech to the chat context. """ await self._track_published_fut + speech_id = utils.shortuuid() self._add_speech_for_playout( _SpeechInfo( + id=speech_id, source=source, allow_interruptions=allow_interruptions, add_to_chat_ctx=add_to_chat_ctx, - synthesis_handle=self._synthesize_agent_speech(source), + synthesis_handle=self._synthesize_agent_speech(speech_id, source), ) ) @@ -354,9 +357,9 @@ def _on_vad_updated(ev: vad.VADEvent) -> None: tv = 1.0 if self._opts.allow_interruptions: tv = max(0.0, 1.0 - ev.probability) - self._agent_output.audio_source.target_volume = tv + self._agent_output.playout.target_volume = tv - smoothed_tv = self._agent_output.audio_source.smoothed_volume + smoothed_tv = self._agent_output.playout.smoothed_volume self._plotter.plot_value("raw_vol", tv) self._plotter.plot_value("smoothed_vol", smoothed_tv) @@ -406,10 +409,10 @@ async def _main_task(self) -> None: track, rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE) ) - cancellable_audio_source = CancellableAudioSource(source=audio_source) + agent_playout = AgentPlayout(source=audio_source) self._agent_output = AgentOutput( room=self._room, - source=cancellable_audio_source, + agent_playout=agent_playout, llm=self._llm, tts=self._tts, ) @@ -422,8 +425,8 @@ def _on_playout_stopped(cancelled: bool) -> None: self._plotter.plot_event("agent_stopped_speaking") self.emit("agent_stopped_speaking") - cancellable_audio_source.on("playout_started", _on_playout_started) - cancellable_audio_source.on("playout_stopped", _on_playout_stopped) + agent_playout.on("playout_started", _on_playout_started) + agent_playout.on("playout_stopped", _on_playout_stopped) self._track_published_fut.set_result(None) @@ -463,15 +466,26 @@ async def _synthesize_answer_task( self, chat_ctx=copied_ctx ) + speech_id = utils.shortuuid() reply = _SpeechInfo( + id=speech_id, source=llm_stream, allow_interruptions=self._opts.allow_interruptions, add_to_chat_ctx=True, - synthesis_handle=self._synthesize_agent_speech(llm_stream), + synthesis_handle=self._synthesize_agent_speech(speech_id, llm_stream), is_reply=True, user_question=user_transcript, ) + logger.debug( + "synthesizing agent reply", + extra={ + "speech_id": reply.id, + "user_transcript": user_transcript, + "validated": validated, + }, + ) + if validated: self._add_speech_for_playout(reply) else: @@ -493,7 +507,7 @@ async def _play_speech(self, speech_info: _SpeechInfo) -> None: if synthesis_handle.interrupted: return - logger.debug("VoiceAssistant._play_speech started") + logger.debug("_play_speech started", extra={"speech_id": speech_info.id}) user_question = speech_info.user_question user_speech_commited = False @@ -527,6 +541,9 @@ def _commit_user_question_if_needed() -> None: ): return + logger.debug( + "committed user transcript", extra={"user_transcript": user_question} + ) user_msg = ChatMessage.create(text=user_question, role="user") self._chat_ctx.messages.append(user_msg) self.emit("user_speech_committed", user_msg) @@ -573,6 +590,13 @@ def _commit_user_question_if_needed() -> None: for fnc in called_fncs_info: called_fnc = fnc.execute() called_fncs.append(called_fnc) + logger.debug( + "executing ai function", + extra={ + "function": fnc.function_info.name, + "speech_id": speech_info.id, + }, + ) try: await called_fnc.task except Exception: @@ -604,7 +628,9 @@ def _commit_user_question_if_needed() -> None: answer_llm_stream = self._llm.chat( chat_ctx=chat_ctx, fnc_ctx=self._fnc_ctx ) - answer_synthesis = self._synthesize_agent_speech(answer_llm_stream) + answer_synthesis = self._synthesize_agent_speech( + speech_info.id, answer_llm_stream + ) # replace the synthesis handle with the new one to allow interruption speech_info.synthesis_handle = answer_synthesis play_handle = answer_synthesis.play() @@ -624,10 +650,16 @@ def _commit_user_question_if_needed() -> None: else: self.emit("agent_speech_committed", msg) - logger.debug("VoiceAssistant._play_speech ended") + logger.debug( + "committed agent speech", + extra={"speech_id": speech_info.id, "interrupted": interrupted}, + ) + + logger.debug("_play_speech ended", extra={"speech_id": speech_info.id}) def _synthesize_agent_speech( self, + speech_id: str, source: str | LLMStream | AsyncIterable[str], ) -> SynthesisHandle: assert ( @@ -638,6 +670,7 @@ def _synthesize_agent_speech( source = _llm_stream_to_str_iterable(source) return self._agent_output.synthesize( + speech_id=speech_id, transcript=source, transcription=self._opts.transcription.agent_transcription, transcription_speed=self._opts.transcription.agent_transcription_speed, @@ -658,6 +691,10 @@ def _validate_reply_if_possible(self) -> None: if speech.allow_interruptions and speech.is_reply: speech.synthesis_handle.interrupt() + logger.debug( + "validated agent reply", + extra={"speech_id": self._pending_agent_reply.id}, + ) self._add_speech_for_playout(self._pending_agent_reply) self._pending_agent_reply = None elif not self._opts.preemptive_synthesis and self._transcribed_text: diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/__init__.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/__init__.py index 627afbae5..a5884c81e 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/__init__.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/__init__.py @@ -18,13 +18,12 @@ from livekit.agents import Plugin +from .log import logger + class AzurePlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(AzurePlugin()) diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/__init__.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/__init__.py index 3e4d48421..1f7001c95 100644 --- a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/__init__.py +++ b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/__init__.py @@ -19,13 +19,12 @@ from livekit.agents import Plugin +from .log import logger + class CartesiaPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(CartesiaPlugin()) diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py index 60c5f74ba..5e8075b4a 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py @@ -6,13 +6,12 @@ from livekit.agents import Plugin +from .log import logger + class DeepgramPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(DeepgramPlugin()) diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/__init__.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/__init__.py index a59eb1983..4fb66ae80 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/__init__.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/__init__.py @@ -28,13 +28,12 @@ from livekit.agents import Plugin +from .log import logger + class ElevenLabsPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(ElevenLabsPlugin()) diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py index 4cd6710b8..4d7212140 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py @@ -20,13 +20,12 @@ from livekit.agents import Plugin +from .log import logger + class GooglePlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(GooglePlugin()) diff --git a/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/__init__.py b/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/__init__.py index 25387a210..bf7992168 100644 --- a/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/__init__.py +++ b/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/__init__.py @@ -14,15 +14,13 @@ from livekit.agents import Plugin +from .log import logger from .version import __version__ class MinimalPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(MinimalPlugin()) diff --git a/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/log.py b/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/log.py new file mode 100644 index 000000000..fd880feac --- /dev/null +++ b/livekit-plugins/livekit-plugins-minimal/livekit/plugins/minimal/log.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger("livekit.plugins.minimal") diff --git a/livekit-plugins/livekit-plugins-nltk/livekit/plugins/nltk/__init__.py b/livekit-plugins/livekit-plugins-nltk/livekit/plugins/nltk/__init__.py index f8fff0e9f..3f0e48514 100644 --- a/livekit-plugins/livekit-plugins-nltk/livekit/plugins/nltk/__init__.py +++ b/livekit-plugins/livekit-plugins-nltk/livekit/plugins/nltk/__init__.py @@ -23,10 +23,12 @@ import nltk # type: ignore +from .log import logger + class NltkPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) + super().__init__(__name__, __version__, __package__, logger) def download_files(self): try: diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/__init__.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/__init__.py index b7dfb95c5..e0fa12e4b 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/__init__.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/__init__.py @@ -34,13 +34,12 @@ from livekit.agents import Plugin +from .log import logger + class OpenAIPlugin(Plugin): def __init__(self) -> None: - super().__init__(__name__, __version__, __package__) - - def download_files(self) -> None: - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(OpenAIPlugin()) diff --git a/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/__init__.py b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/__init__.py index 1157ad64f..3cd283ce6 100644 --- a/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/__init__.py +++ b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/__init__.py @@ -20,13 +20,12 @@ from livekit.agents import Plugin +from .log import logger + class RAGPlugin(Plugin): def __init__(self) -> None: - super().__init__(__name__, __version__, __package__) - - def download_files(self) -> None: - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(RAGPlugin()) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/__init__.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/__init__.py index 21e25e474..8f3b7baf3 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/__init__.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/__init__.py @@ -19,13 +19,12 @@ from livekit.agents import Plugin +from .log import logger + class SileroPlugin(Plugin): def __init__(self): - super().__init__(__name__, __version__, __package__) - - def download_files(self): - pass + super().__init__(__name__, __version__, __package__, logger) Plugin.register_plugin(SileroPlugin()) From 00430ce25210fc15fde0ae22305923beccf0ddc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 23:03:28 +0200 Subject: [PATCH 12/24] attempt to fix flaky tests (#557) --- tests/test_ipc.py | 5 ++++- tests/test_llm.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_ipc.py b/tests/test_ipc.py index 0a4a86086..779bcb14b 100644 --- a/tests/test_ipc.py +++ b/tests/test_ipc.py @@ -315,10 +315,11 @@ async def test_shutdown_no_job(): proc, start_args = _create_proc(close_timeout=10.0, mp_ctx=mp_ctx) proc.start() await proc.initialize() + await asyncio.sleep(1.0) await proc.aclose() - assert not proc.killed assert proc.exitcode == 0 + assert not proc.killed assert ( start_args.shutdown_counter.value == 0 ), "shutdown_cb isn't called when there is no job" @@ -331,6 +332,7 @@ async def test_job_slow_shutdown(): proc.start() await proc.initialize() + await asyncio.sleep(1.0) fake_job = _generate_fake_job() await proc.launch_job(fake_job) @@ -348,6 +350,7 @@ async def test_job_graceful_shutdown(): start_args.shutdown_simulate_work_time = 1.0 proc.start() await proc.initialize() + await asyncio.sleep(1.0) fake_job = _generate_fake_job() await proc.launch_job(fake_job) diff --git a/tests/test_llm.py b/tests/test_llm.py index ae5f27131..97ff6f033 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -165,7 +165,7 @@ async def test_calls_arrays(): llm = openai.LLM(model="gpt-4o") stream = await _request_fnc_call( - llm, "Can you select all currencies in Europe?", fnc_ctx + llm, "Can you select all currencies in Europe at once?", fnc_ctx ) fns = stream.execute_functions() await asyncio.gather(*[f.task for f in fns]) From dc9647d990814f4df4ed9921d75865719df4bbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 23:30:01 +0200 Subject: [PATCH 13/24] reduce the default load threshold to a more appropriate default (#559) --- .changeset/witty-dolls-perform.md | 5 +++++ livekit-agents/livekit/agents/worker.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 .changeset/witty-dolls-perform.md diff --git a/.changeset/witty-dolls-perform.md b/.changeset/witty-dolls-perform.md new file mode 100644 index 000000000..f4ab48591 --- /dev/null +++ b/.changeset/witty-dolls-perform.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +reduce the default load threshold to a more appropriate default diff --git a/livekit-agents/livekit/agents/worker.py b/livekit-agents/livekit/agents/worker.py index eb7d147ec..40ead5e30 100644 --- a/livekit-agents/livekit/agents/worker.py +++ b/livekit-agents/livekit/agents/worker.py @@ -71,7 +71,7 @@ class WorkerOptions: request_fnc: Callable[[JobRequest], Coroutine] = _default_request_fnc prewarm_fnc: Callable[[JobProcess], Any] = _default_initialize_process_fnc load_fnc: Callable[[], float] = _default_cpu_load_fnc - load_threshold: float = 0.8 + load_threshold: float = 0.65 num_idle_processes: int = 3 shutdown_process_timeout: float = 60.0 initialize_process_timeout: float = 10.0 From fba5e0fba584ec23cbf82091958b984bcda9812e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Wed, 31 Jul 2024 23:36:54 +0200 Subject: [PATCH 14/24] warn no room connection after job_entry was called after 10 seconds. (#558) Co-authored-by: David Zhao --- .changeset/eleven-monkeys-serve.md | 5 +++++ livekit-agents/livekit/agents/ipc/proc_main.py | 13 +++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 .changeset/eleven-monkeys-serve.md diff --git a/.changeset/eleven-monkeys-serve.md b/.changeset/eleven-monkeys-serve.md new file mode 100644 index 000000000..7e30ac802 --- /dev/null +++ b/.changeset/eleven-monkeys-serve.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +warn no room connection after job_entry was called after 10 seconds. diff --git a/livekit-agents/livekit/agents/ipc/proc_main.py b/livekit-agents/livekit/agents/ipc/proc_main.py index 9f8da92c8..2d84651b9 100644 --- a/livekit-agents/livekit/agents/ipc/proc_main.py +++ b/livekit-agents/livekit/agents/ipc/proc_main.py @@ -95,6 +95,19 @@ async def _run_job_task() -> None: args.job_entrypoint_fnc(job_ctx), name="job_entrypoint" ) + async def _warn_not_connected_task(): + await asyncio.sleep(10) + if not ctx_connect and not ctx_shutdown: + logger.warn( + ( + "room not connected after job_entry was called after 10 seconds, " + "did you forget to call job_ctx.connect()?" + ) + ) + + warn_unconnected_task = asyncio.create_task(_warn_not_connected_task()) + job_entry_task.add_done_callback(lambda _: warn_unconnected_task.cancel()) + def log_exception(t: asyncio.Task) -> None: if not t.cancelled() and t.exception(): logger.error( From ab1c0c6e77e16519dfdfb3717a6b99c63e97131d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Thu, 1 Aug 2024 02:29:48 +0200 Subject: [PATCH 15/24] voiceassistant: tweaks & fix speech being removed too soon from the queue (#560) --- .changeset/warm-ducks-suffer.md | 5 ++ examples/voice-assistant/function_calling.py | 3 + .../agents/voice_assistant/agent_output.py | 8 +- .../agents/voice_assistant/agent_playout.py | 4 +- .../agents/voice_assistant/voice_assistant.py | 89 ++++++++++++------- 5 files changed, 74 insertions(+), 35 deletions(-) create mode 100644 .changeset/warm-ducks-suffer.md diff --git a/.changeset/warm-ducks-suffer.md b/.changeset/warm-ducks-suffer.md new file mode 100644 index 000000000..d43863e32 --- /dev/null +++ b/.changeset/warm-ducks-suffer.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voiceassistant: tweaks & fix speech being removed too soon from the queue diff --git a/examples/voice-assistant/function_calling.py b/examples/voice-assistant/function_calling.py index 19425cd7a..9392bc900 100644 --- a/examples/voice-assistant/function_calling.py +++ b/examples/voice-assistant/function_calling.py @@ -3,10 +3,13 @@ import logging from typing import Annotated +from dotenv import load_dotenv from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli, llm from livekit.agents.voice_assistant import VoiceAssistant from livekit.plugins import deepgram, openai, silero +load_dotenv() + logger = logging.getLogger("function-calling-demo") logger.setLevel(logging.INFO) diff --git a/livekit-agents/livekit/agents/voice_assistant/agent_output.py b/livekit-agents/livekit/agents/voice_assistant/agent_output.py index 183641311..052767acc 100644 --- a/livekit-agents/livekit/agents/voice_assistant/agent_output.py +++ b/livekit-agents/livekit/agents/voice_assistant/agent_output.py @@ -73,7 +73,7 @@ def interrupt(self) -> None: return logger.debug( - "SynthesisHandle.interrupt: interrupting synthesis", + "interrupting synthesis/playout", extra={"speech_id": self.speech_id}, ) @@ -179,10 +179,11 @@ async def _str_synthesis_task(text: str, handle: SynthesisHandle) -> None: if first_frame: first_frame = False logger.debug( - "AgentOutput._str_synthesis_task: received first frame of TTS", + "received first TTS frame", extra={ "speech_id": handle.speech_id, "elapsed": time.time() - start_time, + "streamed": False, }, ) @@ -211,10 +212,11 @@ async def _read_generated_audio_task(): if first_frame: first_frame = False logger.debug( - "AgentOutput._stream_synthesis_task: received first frame of TTS", + "first TTS frame", extra={ "speech_id": handle.speech_id, "elapsed": time.time() - start_time, + "streamed": True, }, ) diff --git a/livekit-agents/livekit/agents/voice_assistant/agent_playout.py b/livekit-agents/livekit/agents/voice_assistant/agent_playout.py index 10aedce74..4106a25ea 100644 --- a/livekit-agents/livekit/agents/voice_assistant/agent_playout.py +++ b/livekit-agents/livekit/agents/voice_assistant/agent_playout.py @@ -121,7 +121,7 @@ def _should_break(): handle._tr_fwd.segment_playout_started() logger.debug( - "AgentPlayout: first frame started playing", + "started playing the first frame", extra={"speech_id": handle.speech_id}, ) @@ -170,6 +170,6 @@ def _should_break(): await handle._tr_fwd.aclose() logger.debug( - "AgentPlayout: done playing", + "playout finished", extra={"speech_id": handle.speech_id, "cancelled": cancelled}, ) diff --git a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py index 21266e62d..ef54224fe 100644 --- a/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py +++ b/livekit-agents/livekit/agents/voice_assistant/voice_assistant.py @@ -128,7 +128,7 @@ def __init__( chat_ctx: ChatContext | None = None, fnc_ctx: FunctionContext | None = None, allow_interruptions: bool = True, - interrupt_speech_duration: float = 0.6, + interrupt_speech_duration: float = 0.45, interrupt_min_words: int = 0, preemptive_synthesis: bool = True, transcription: AssistantTranscriptionOptions = AssistantTranscriptionOptions(), @@ -434,9 +434,10 @@ def _on_playout_stopped(cancelled: bool) -> None: await self._speech_q_changed.wait() while self._speech_q: - speech = self._speech_q.pop(0) + speech = self._speech_q[0] self._playing_speech = speech await self._play_speech(speech) + self._speech_q.pop(0) # Remove the element only after playing self._playing_speech = None self._speech_q_changed.clear() @@ -480,9 +481,9 @@ async def _synthesize_answer_task( logger.debug( "synthesizing agent reply", extra={ - "speech_id": reply.id, "user_transcript": user_transcript, "validated": validated, + "speech_id": reply.id, }, ) @@ -507,12 +508,11 @@ async def _play_speech(self, speech_info: _SpeechInfo) -> None: if synthesis_handle.interrupted: return - logger.debug("_play_speech started", extra={"speech_id": speech_info.id}) - user_question = speech_info.user_question user_speech_commited = False play_handle = synthesis_handle.play() + join_fut = play_handle.join() def _commit_user_question_if_needed() -> None: nonlocal user_speech_commited @@ -554,7 +554,6 @@ def _commit_user_question_if_needed() -> None: # wait for the play_handle to finish and check every 1s if the user question should be committed _commit_user_question_if_needed() - join_fut = play_handle.join() while not join_fut.done(): await asyncio.wait( [join_fut], return_when=asyncio.FIRST_COMPLETED, timeout=1.0 @@ -652,11 +651,13 @@ def _commit_user_question_if_needed() -> None: logger.debug( "committed agent speech", - extra={"speech_id": speech_info.id, "interrupted": interrupted}, + extra={ + "agent_transcript": collected_text, + "interrupted": interrupted, + "speech_id": speech_info.id, + }, ) - logger.debug("_play_speech ended", extra={"speech_id": speech_info.id}) - def _synthesize_agent_speech( self, speech_id: str, @@ -667,7 +668,7 @@ def _synthesize_agent_speech( ), "agent output should be initialized when ready" if isinstance(source, LLMStream): - source = _llm_stream_to_str_iterable(source) + source = _llm_stream_to_str_iterable(speech_id, source) return self._agent_output.synthesize( speech_id=speech_id, @@ -729,12 +730,23 @@ def _add_speech_for_playout(self, speech: _SpeechInfo) -> None: self._speech_q_changed.set() -async def _llm_stream_to_str_iterable(stream: LLMStream) -> AsyncIterable[str]: +async def _llm_stream_to_str_iterable( + speech_id: str, stream: LLMStream +) -> AsyncIterable[str]: + start_time = time.time() + first_frame = True async for chunk in stream: content = chunk.choices[0].delta.content if content is None: continue + if first_frame: + first_frame = False + logger.debug( + "first LLM token", + extra={"speech_id": speech_id, "elapsed": time.time() - start_time}, + ) + yield content @@ -743,9 +755,11 @@ class _DeferredReplyValidation: # if the STT gives us punctuation, we can try validate the reply faster. PUNCTUATION = ".!?" - DEFER_DELAY_WITH_PUNCTUATION = 0.1 - DEFER_DELAY = 0.2 - LATE_TRANSCRIPT_TOLERANCE = 5 + PUNCTUATION_REDUCE_FACTOR = 0.8 + + DEFER_DELAY_END_OF_SPEECH = 0.2 + DEFER_DELAY_FINAL_TRANSCRIPT = 1.0 + LATE_TRANSCRIPT_TOLERANCE = 1.5 # late compared to end of speech def __init__( self, validate_fnc: Callable[[], None], loop: asyncio.AbstractEventLoop @@ -756,6 +770,7 @@ def __init__( self._validating_task: asyncio.Task | None = None self._last_final_transcript: str = "" self._last_recv_end_of_speech_time: float = 0.0 + self._speaking = False @property def validating(self) -> bool: @@ -764,26 +779,43 @@ def validating(self) -> bool: def on_human_final_transcript(self, transcript: str) -> None: self._last_final_transcript = transcript.strip() # type: ignore - if self.validating: - self._run(self._get_defer_delay()) # debounce - elif ( - self._last_recv_end_of_speech_time - and time.time() - self._last_recv_end_of_speech_time + if self._speaking: + return + + has_recent_end_of_speech = ( + time.time() - self._last_recv_end_of_speech_time < self.LATE_TRANSCRIPT_TOLERANCE - ): - # final transcript was received after human stopped speaking - self._run(self._get_defer_delay()) + ) + delay = ( + self.DEFER_DELAY_END_OF_SPEECH + if has_recent_end_of_speech + else self.DEFER_DELAY_FINAL_TRANSCRIPT + ) + delay = ( + delay * self.PUNCTUATION_REDUCE_FACTOR + if self._end_with_punctuation() + else 1.0 + ) + + self._run(delay) def on_human_start_of_speech(self, ev: vad.VADEvent) -> None: + self._speaking = True if self.validating: assert self._validating_task is not None self._validating_task.cancel() def on_human_end_of_speech(self, ev: vad.VADEvent) -> None: + self._speaking = False self._last_recv_end_of_speech_time = time.time() if self._last_final_transcript: - self._run(self._get_defer_delay()) + delay = ( + self.DEFER_DELAY_END_OF_SPEECH * self.PUNCTUATION_REDUCE_FACTOR + if self._end_with_punctuation() + else 1.0 + ) + self._run(delay) async def aclose(self) -> None: if self._validating_task is not None: @@ -791,14 +823,11 @@ async def aclose(self) -> None: await self._tasks_set.aclose() - def _get_defer_delay(self) -> float: - if ( - self._last_final_transcript + def _end_with_punctuation(self) -> bool: + return ( + len(self._last_final_transcript) > 0 and self._last_final_transcript[-1] in self.PUNCTUATION - ): - return self.DEFER_DELAY_WITH_PUNCTUATION - - return self.DEFER_DELAY + ) def _reset_states(self) -> None: self._last_final_transcript = "" From 384c66efb1b002b6966c05db04d9ebc64053c653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Thu, 1 Aug 2024 03:11:48 +0200 Subject: [PATCH 16/24] deepgram: reduce chunks size to 100ms (#561) --- .changeset/gorgeous-teachers-rule.md | 6 ++++++ livekit-agents/livekit/agents/utils/audio.py | 9 ++++----- .../livekit/plugins/deepgram/stt.py | 6 +++--- 3 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 .changeset/gorgeous-teachers-rule.md diff --git a/.changeset/gorgeous-teachers-rule.md b/.changeset/gorgeous-teachers-rule.md new file mode 100644 index 000000000..a10f1b9dc --- /dev/null +++ b/.changeset/gorgeous-teachers-rule.md @@ -0,0 +1,6 @@ +--- +"livekit-agents": patch +"livekit-plugins-deepgram": patch +--- + +deepgram: reduce chunks size to 100ms diff --git a/livekit-agents/livekit/agents/utils/audio.py b/livekit-agents/livekit/agents/utils/audio.py index 99472b3d9..c31dcd5f9 100644 --- a/livekit-agents/livekit/agents/utils/audio.py +++ b/livekit-agents/livekit/agents/utils/audio.py @@ -12,17 +12,16 @@ def __init__( self, sample_rate: int, num_channels: int, - samples_per_frame: int | None = None, + samples_per_channel: int | None = None, ) -> None: self._sample_rate = sample_rate self._num_channels = num_channels - if samples_per_frame is None: - samples_per_frame = sample_rate // 100 # 10ms by default + if samples_per_channel is None: + samples_per_channel = sample_rate // 50 # 20ms by default - self._samples_per_frame = samples_per_frame self._bytes_per_frame = ( - num_channels * samples_per_frame * ctypes.sizeof(ctypes.c_int16) + num_channels * samples_per_channel * ctypes.sizeof(ctypes.c_int16) ) self._buf = bytearray() diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index e71ac09b7..81b3cdfbe 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -256,12 +256,12 @@ async def keepalive_task(): async def send_task(): nonlocal closing_ws - # forward audio to deepgram in chunks of 200ms - ms200 = self._opts.sample_rate // 5 * self._opts.num_channels + # forward audio to deepgram in chunks of 100ms + samples_100ms = self._opts.sample_rate // 10 audio_bstream = utils.audio.AudioByteStream( sample_rate=self._opts.sample_rate, num_channels=self._opts.num_channels, - samples_per_frame=ms200, + samples_per_channel=samples_100ms, ) async for data in self._input_ch: From 2a4a24057996de9897141958686bf594e655affa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Thu, 1 Aug 2024 03:14:14 +0200 Subject: [PATCH 17/24] silero: bring back expfilter (#562) --- .changeset/slimy-seas-hammer.md | 5 +++++ .../livekit-plugins-silero/livekit/plugins/silero/vad.py | 7 +++++++ 2 files changed, 12 insertions(+) create mode 100644 .changeset/slimy-seas-hammer.md diff --git a/.changeset/slimy-seas-hammer.md b/.changeset/slimy-seas-hammer.md new file mode 100644 index 000000000..c20edcf01 --- /dev/null +++ b/.changeset/slimy-seas-hammer.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: bring back expfilter diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 7673a9828..9a30b34cb 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -22,6 +22,7 @@ import numpy as np import onnxruntime # type: ignore from livekit import agents, rtc +from livekit.agents import utils from . import onnx_model from .log import logger @@ -104,6 +105,7 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None: self._executor = ThreadPoolExecutor(max_workers=1) self._task.add_done_callback(lambda _: self._executor.shutdown(wait=False)) + self._exp_filter = utils.ExpFilter(alpha=0.5) @agents.utils.log_exceptions(logger=logger) async def _main_task(self): @@ -184,6 +186,11 @@ async def _main_task(self): raw_prob = await self._loop.run_in_executor( self._executor, self._model, inference_window_data ) + + prob_change = abs(raw_prob - self._exp_filter.filtered()) + exp = 0.4 if prob_change > 0.45 else 1 + raw_prob = self._exp_filter.apply(exp=exp, sample=raw_prob) + inference_duration = time.time() - start_time window_duration = ( self._model.window_size_samples / self._opts.sample_rate From c862923adde25fb66a95213fcd83458064062eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Thu, 1 Aug 2024 22:24:52 +0200 Subject: [PATCH 18/24] silero: tiny tweaks (#565) --- .changeset/clever-poets-behave.md | 5 +++++ .../livekit-plugins-silero/livekit/plugins/silero/vad.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 .changeset/clever-poets-behave.md diff --git a/.changeset/clever-poets-behave.md b/.changeset/clever-poets-behave.md new file mode 100644 index 000000000..ec040226b --- /dev/null +++ b/.changeset/clever-poets-behave.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: tiny tweaks diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 9a30b34cb..b228f02cf 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -44,7 +44,7 @@ def load( cls, *, min_speech_duration: float = 0.05, - min_silence_duration: float = 0.1, + min_silence_duration: float = 0.25, padding_duration: float = 0.1, max_buffered_speech: float = 60.0, activation_threshold: float = 0.25, @@ -105,7 +105,7 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None: self._executor = ThreadPoolExecutor(max_workers=1) self._task.add_done_callback(lambda _: self._executor.shutdown(wait=False)) - self._exp_filter = utils.ExpFilter(alpha=0.5) + self._exp_filter = utils.ExpFilter(alpha=0.35) @agents.utils.log_exceptions(logger=logger) async def _main_task(self): @@ -188,7 +188,7 @@ async def _main_task(self): ) prob_change = abs(raw_prob - self._exp_filter.filtered()) - exp = 0.4 if prob_change > 0.45 else 1 + exp = 0.5 if prob_change > 0.25 else 1 raw_prob = self._exp_filter.apply(exp=exp, sample=raw_prob) inference_duration = time.time() - start_time From 8ce90df03ec2e81dad0a56f66abb21b086704803 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:26:53 +0200 Subject: [PATCH 19/24] Version Packages (#552) Co-authored-by: github-actions[bot] --- .changeset/angry-books-eat.md | 5 ---- .changeset/clever-poets-behave.md | 5 ---- .changeset/eleven-kings-push.md | 5 ---- .changeset/eleven-monkeys-serve.md | 5 ---- .changeset/gorgeous-teachers-rule.md | 6 ----- .changeset/hot-cows-wash.md | 5 ---- .changeset/many-clocks-punch.md | 5 ---- .changeset/perfect-doors-play.md | 5 ---- .changeset/pretty-rivers-care.md | 5 ---- .changeset/purple-garlics-approve.md | 5 ---- .changeset/six-badgers-sneeze.md | 5 ---- .changeset/slimy-seas-hammer.md | 5 ---- .changeset/warm-ducks-suffer.md | 5 ---- .changeset/wicked-rats-exist.md | 5 ---- .changeset/witty-dolls-perform.md | 5 ---- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 4 +-- examples/text-to-speech/requirements.txt | 2 +- examples/voice-assistant/requirements.txt | 6 ++--- livekit-agents/CHANGELOG.md | 26 +++++++++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-deepgram/CHANGELOG.md | 8 ++++++ .../livekit/plugins/deepgram/version.py | 2 +- .../livekit-plugins-deepgram/package.json | 2 +- .../livekit-plugins-silero/CHANGELOG.md | 10 +++++++ .../livekit/plugins/silero/version.py | 2 +- .../livekit-plugins-silero/package.json | 2 +- 28 files changed, 57 insertions(+), 89 deletions(-) delete mode 100644 .changeset/angry-books-eat.md delete mode 100644 .changeset/clever-poets-behave.md delete mode 100644 .changeset/eleven-kings-push.md delete mode 100644 .changeset/eleven-monkeys-serve.md delete mode 100644 .changeset/gorgeous-teachers-rule.md delete mode 100644 .changeset/hot-cows-wash.md delete mode 100644 .changeset/many-clocks-punch.md delete mode 100644 .changeset/perfect-doors-play.md delete mode 100644 .changeset/pretty-rivers-care.md delete mode 100644 .changeset/purple-garlics-approve.md delete mode 100644 .changeset/six-badgers-sneeze.md delete mode 100644 .changeset/slimy-seas-hammer.md delete mode 100644 .changeset/warm-ducks-suffer.md delete mode 100644 .changeset/wicked-rats-exist.md delete mode 100644 .changeset/witty-dolls-perform.md diff --git a/.changeset/angry-books-eat.md b/.changeset/angry-books-eat.md deleted file mode 100644 index 8c09f6c2e..000000000 --- a/.changeset/angry-books-eat.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: run function calls sequentially diff --git a/.changeset/clever-poets-behave.md b/.changeset/clever-poets-behave.md deleted file mode 100644 index ec040226b..000000000 --- a/.changeset/clever-poets-behave.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-silero": patch ---- - -silero: tiny tweaks diff --git a/.changeset/eleven-kings-push.md b/.changeset/eleven-kings-push.md deleted file mode 100644 index 332b38e99..000000000 --- a/.changeset/eleven-kings-push.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -configure plugins loggers & more debug logs on the voiceassistant diff --git a/.changeset/eleven-monkeys-serve.md b/.changeset/eleven-monkeys-serve.md deleted file mode 100644 index 7e30ac802..000000000 --- a/.changeset/eleven-monkeys-serve.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -warn no room connection after job_entry was called after 10 seconds. diff --git a/.changeset/gorgeous-teachers-rule.md b/.changeset/gorgeous-teachers-rule.md deleted file mode 100644 index a10f1b9dc..000000000 --- a/.changeset/gorgeous-teachers-rule.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"livekit-agents": patch -"livekit-plugins-deepgram": patch ---- - -deepgram: reduce chunks size to 100ms diff --git a/.changeset/hot-cows-wash.md b/.changeset/hot-cows-wash.md deleted file mode 100644 index 04ed627c0..000000000 --- a/.changeset/hot-cows-wash.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-silero": patch ---- - -silero: optimize numpy input buffers diff --git a/.changeset/many-clocks-punch.md b/.changeset/many-clocks-punch.md deleted file mode 100644 index fa3125796..000000000 --- a/.changeset/many-clocks-punch.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: cleanup validation behaviour #545 diff --git a/.changeset/perfect-doors-play.md b/.changeset/perfect-doors-play.md deleted file mode 100644 index 9479814e0..000000000 --- a/.changeset/perfect-doors-play.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-deepgram": patch ---- - -deepgram: segment audio frames into 200ms intervals before sending to the websocket #549 diff --git a/.changeset/pretty-rivers-care.md b/.changeset/pretty-rivers-care.md deleted file mode 100644 index 1c7cf200b..000000000 --- a/.changeset/pretty-rivers-care.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: commit user question directly when allow_interruptions=False diff --git a/.changeset/purple-garlics-approve.md b/.changeset/purple-garlics-approve.md deleted file mode 100644 index a8aaaca09..000000000 --- a/.changeset/purple-garlics-approve.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -ipc: increase high ping threshold diff --git a/.changeset/six-badgers-sneeze.md b/.changeset/six-badgers-sneeze.md deleted file mode 100644 index 331bc63fe..000000000 --- a/.changeset/six-badgers-sneeze.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: interrupt on final transcript diff --git a/.changeset/slimy-seas-hammer.md b/.changeset/slimy-seas-hammer.md deleted file mode 100644 index c20edcf01..000000000 --- a/.changeset/slimy-seas-hammer.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-silero": patch ---- - -silero: bring back expfilter diff --git a/.changeset/warm-ducks-suffer.md b/.changeset/warm-ducks-suffer.md deleted file mode 100644 index d43863e32..000000000 --- a/.changeset/warm-ducks-suffer.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: tweaks & fix speech being removed too soon from the queue diff --git a/.changeset/wicked-rats-exist.md b/.changeset/wicked-rats-exist.md deleted file mode 100644 index bf9c6fe7c..000000000 --- a/.changeset/wicked-rats-exist.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -voiceassistant: fix duplicate answers diff --git a/.changeset/witty-dolls-perform.md b/.changeset/witty-dolls-perform.md deleted file mode 100644 index f4ab48591..000000000 --- a/.changeset/witty-dolls-perform.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -reduce the default load threshold to a more appropriate default diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index b23dc5884..62d646a7b 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1 +1 @@ -livekit-agents>=0.8.2 +livekit-agents>=0.8.3 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index daa1f1918..36fe12113 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.8.2 -livekit-plugins-deepgram>=0.6.1 +livekit-agents>=0.8.3 +livekit-plugins-deepgram>=0.6.2 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index d33e84495..d400a0e16 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.8.2 +livekit-agents>=0.8.3 livekit-plugins-openai>=0.7.1 diff --git a/examples/voice-assistant/requirements.txt b/examples/voice-assistant/requirements.txt index 4e53fbbcb..b2fbda0fa 100644 --- a/examples/voice-assistant/requirements.txt +++ b/examples/voice-assistant/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.8.2 +livekit-agents>=0.8.3 livekit-plugins-openai>=0.7.1 -livekit-plugins-deepgram>=0.6.1 -livekit-plugins-silero>=0.6.1 +livekit-plugins-deepgram>=0.6.2 +livekit-plugins-silero>=0.6.2 python-dotenv~=1.0 \ No newline at end of file diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 1c2815bfb..a049fe994 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,31 @@ # livekit-agents +## 0.8.3 + +### Patch Changes + +- voiceassistant: run function calls sequentially - [#554](https://github.com/livekit/agents/pull/554) ([@theomonnom](https://github.com/theomonnom)) + +- configure plugins loggers & more debug logs on the voiceassistant - [#555](https://github.com/livekit/agents/pull/555) ([@theomonnom](https://github.com/theomonnom)) + +- warn no room connection after job_entry was called after 10 seconds. - [#558](https://github.com/livekit/agents/pull/558) ([@theomonnom](https://github.com/theomonnom)) + +- deepgram: reduce chunks size to 100ms - [#561](https://github.com/livekit/agents/pull/561) ([@theomonnom](https://github.com/theomonnom)) + +- voiceassistant: cleanup validation behaviour #545 - [#553](https://github.com/livekit/agents/pull/553) ([@theomonnom](https://github.com/theomonnom)) + +- voiceassistant: commit user question directly when allow_interruptions=False - [#547](https://github.com/livekit/agents/pull/547) ([@theomonnom](https://github.com/theomonnom)) + +- ipc: increase high ping threshold - [#556](https://github.com/livekit/agents/pull/556) ([@theomonnom](https://github.com/theomonnom)) + +- voiceassistant: interrupt on final transcript - [#546](https://github.com/livekit/agents/pull/546) ([@theomonnom](https://github.com/theomonnom)) + +- voiceassistant: tweaks & fix speech being removed too soon from the queue - [#560](https://github.com/livekit/agents/pull/560) ([@theomonnom](https://github.com/theomonnom)) + +- voiceassistant: fix duplicate answers - [#548](https://github.com/livekit/agents/pull/548) ([@theomonnom](https://github.com/theomonnom)) + +- reduce the default load threshold to a more appropriate default - [#559](https://github.com/livekit/agents/pull/559) ([@theomonnom](https://github.com/theomonnom)) + ## 0.8.2 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index 7c3dfeb0f..a2e0e23e4 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.8.2" +__version__ = "0.8.3" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index 90c9f1fbd..e185d0dde 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.8.2" + "version": "0.8.3" } diff --git a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md index 5f387508d..e5816b678 100644 --- a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md @@ -1,5 +1,13 @@ # livekit-plugins-deepgram +## 0.6.2 + +### Patch Changes + +- deepgram: reduce chunks size to 100ms - [#561](https://github.com/livekit/agents/pull/561) ([@theomonnom](https://github.com/theomonnom)) + +- deepgram: segment audio frames into 200ms intervals before sending to the websocket #549 - [#553](https://github.com/livekit/agents/pull/553) ([@theomonnom](https://github.com/theomonnom)) + ## 0.6.1 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py index ee48274f0..61bb6ddc4 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.1" +__version__ = "0.6.2" diff --git a/livekit-plugins/livekit-plugins-deepgram/package.json b/livekit-plugins/livekit-plugins-deepgram/package.json index 8e7cc15aa..a1a670da7 100644 --- a/livekit-plugins/livekit-plugins-deepgram/package.json +++ b/livekit-plugins/livekit-plugins-deepgram/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-deepgram", "private": true, - "version": "0.6.1" + "version": "0.6.2" } diff --git a/livekit-plugins/livekit-plugins-silero/CHANGELOG.md b/livekit-plugins/livekit-plugins-silero/CHANGELOG.md index adeb075a6..e95f546a0 100644 --- a/livekit-plugins/livekit-plugins-silero/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-silero/CHANGELOG.md @@ -1,5 +1,15 @@ # livekit-plugins-silero +## 0.6.2 + +### Patch Changes + +- silero: tiny tweaks - [#565](https://github.com/livekit/agents/pull/565) ([@theomonnom](https://github.com/theomonnom)) + +- silero: optimize numpy input buffers - [#550](https://github.com/livekit/agents/pull/550) ([@theomonnom](https://github.com/theomonnom)) + +- silero: bring back expfilter - [#562](https://github.com/livekit/agents/pull/562) ([@theomonnom](https://github.com/theomonnom)) + ## 0.6.1 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/version.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/version.py index ee48274f0..61bb6ddc4 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/version.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.1" +__version__ = "0.6.2" diff --git a/livekit-plugins/livekit-plugins-silero/package.json b/livekit-plugins/livekit-plugins-silero/package.json index ca1649244..aa75e10d1 100644 --- a/livekit-plugins/livekit-plugins-silero/package.json +++ b/livekit-plugins/livekit-plugins-silero/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-silero", "private": true, - "version": "0.6.1" + "version": "0.6.2" } From 4e346c0322127b8bd99fdbf8f255e9919c638036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Sat, 3 Aug 2024 03:20:54 +0200 Subject: [PATCH 20/24] silero: fix high cpu usage (#569) --- .changeset/red-numbers-complain.md | 5 +++++ .../livekit/plugins/silero/onnx_model.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 .changeset/red-numbers-complain.md diff --git a/.changeset/red-numbers-complain.md b/.changeset/red-numbers-complain.md new file mode 100644 index 000000000..d5675a6bc --- /dev/null +++ b/.changeset/red-numbers-complain.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: fix high cpu usage diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py index b0f24e564..a67b659a3 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/onnx_model.py @@ -21,13 +21,15 @@ def new_inference_session(force_cpu: bool) -> onnxruntime.InferenceSession: path = str(_resource_files.enter_context(ctx)) opts = onnxruntime.SessionOptions() + opts.add_session_config_entry("session.intra_op.allow_spinning", "0") + opts.add_session_config_entry("session.inter_op.allow_spinning", "0") opts.inter_op_num_threads = 1 opts.intra_op_num_threads = 1 - opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL if force_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers(): session = onnxruntime.InferenceSession( - path, providers=["CPUExecutionProvider"], ess_options=opts + path, providers=["CPUExecutionProvider"], sess_options=opts ) else: session = onnxruntime.InferenceSession(path, sess_options=opts) From 9a72cccdd91c11af32c06e3255b17628c03a6041 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Fri, 2 Aug 2024 18:38:16 -0700 Subject: [PATCH 21/24] Include chat handling with minimal_assistant (#568) --- examples/speech-to-text/deepgram_stt.py | 8 ++------ examples/voice-assistant/minimal_assistant.py | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/examples/speech-to-text/deepgram_stt.py b/examples/speech-to-text/deepgram_stt.py index 95f45a402..6a8cc100e 100644 --- a/examples/speech-to-text/deepgram_stt.py +++ b/examples/speech-to-text/deepgram_stt.py @@ -32,7 +32,6 @@ async def _forward_transcription( async def entrypoint(ctx: JobContext): logger.info("starting speech-to-text example") stt = deepgram.STT() - tasks = [] async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track): audio_stream = rtc.AudioStream(track) @@ -40,10 +39,7 @@ async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track) room=ctx.room, participant=participant, track=track ) stt_stream = stt.stream() - stt_task = asyncio.create_task( - _forward_transcription(stt_stream, stt_forwarder) - ) - tasks.append(stt_task) + asyncio.create_task(_forward_transcription(stt_stream, stt_forwarder)) async for ev in audio_stream: stt_stream.push_frame(ev.frame) @@ -57,7 +53,7 @@ def on_track_subscribed( participant: rtc.RemoteParticipant, ): if track.kind == rtc.TrackKind.KIND_AUDIO: - tasks.append(asyncio.create_task(transcribe_track(participant, track))) + asyncio.create_task(transcribe_track(participant, track)) if __name__ == "__main__": diff --git a/examples/voice-assistant/minimal_assistant.py b/examples/voice-assistant/minimal_assistant.py index 976fb1243..2ae520663 100644 --- a/examples/voice-assistant/minimal_assistant.py +++ b/examples/voice-assistant/minimal_assistant.py @@ -1,6 +1,7 @@ import asyncio from dotenv import load_dotenv +from livekit import rtc from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli, llm from livekit.agents.voice_assistant import VoiceAssistant from livekit.plugins import deepgram, openai, silero @@ -19,15 +20,31 @@ async def entrypoint(ctx: JobContext): await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) + llm_plugin = openai.LLM() assistant = VoiceAssistant( vad=silero.VAD.load(), stt=deepgram.STT(), - llm=openai.LLM(), + llm=llm_plugin, tts=openai.TTS(), chat_ctx=initial_ctx, ) assistant.start(ctx.room) + # listen to incoming chat messages, only required if you'd like the agent to + # answer incoming messages from Chat + chat = rtc.ChatManager(ctx.room) + + async def answer_from_text(txt: str): + chat_ctx = assistant.chat_ctx.copy() + chat_ctx.append(role="user", text=txt) + stream = llm_plugin.chat(chat_ctx=chat_ctx) + await assistant.say(stream) + + @chat.on("message_received") + def on_chat_received(msg: rtc.ChatMessage): + if msg.message: + asyncio.create_task(answer_from_text(msg.message)) + await asyncio.sleep(1) await assistant.say("Hey, how can I help you today?", allow_interruptions=True) From 88e75d71615a10d6f4d62b77764edba997e73499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Sat, 3 Aug 2024 03:39:07 +0200 Subject: [PATCH 22/24] rag: add missing logger file (#571) --- .changeset/wet-crabs-rhyme.md | 5 +++++ .github/workflows/check-types.yml | 2 ++ .../livekit-plugins-rag/livekit/plugins/rag/chunking.py | 4 ++-- .../livekit-plugins-rag/livekit/plugins/rag/log.py | 3 +++ 4 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 .changeset/wet-crabs-rhyme.md create mode 100644 livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/log.py diff --git a/.changeset/wet-crabs-rhyme.md b/.changeset/wet-crabs-rhyme.md new file mode 100644 index 000000000..a30d70d8a --- /dev/null +++ b/.changeset/wet-crabs-rhyme.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-rag": patch +--- + +rag: add missing logger file diff --git a/.github/workflows/check-types.yml b/.github/workflows/check-types.yml index aa38c7bef..aa560e70c 100644 --- a/.github/workflows/check-types.yml +++ b/.github/workflows/check-types.yml @@ -39,6 +39,7 @@ jobs: ./livekit-plugins/livekit-plugins-silero \ ./livekit-plugins/livekit-plugins-elevenlabs \ ./livekit-plugins/livekit-plugins-cartesia \ + ./livekit-plugins/livekit-plugins-rag \ ./livekit-plugins/livekit-plugins-azure - name: Install stub packages @@ -65,4 +66,5 @@ jobs: -p livekit.plugins.silero \ -p livekit.plugins.elevenlabs \ -p livekit.plugins.cartesia \ + -p livekit.plugins.rag \ -p livekit.plugins.azure diff --git a/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/chunking.py b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/chunking.py index 2a2be3402..f866ec6b9 100644 --- a/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/chunking.py +++ b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/chunking.py @@ -26,9 +26,9 @@ def __init__( def chunk(self, *, text: str) -> list[str]: chunks = [] - buf_words = [] + buf_words: list[str] = [] for paragraph in self._paragraph_tokenizer(text): - last_buf_words = [] + last_buf_words: list[str] = [] for sentence in self._sentence_tokenizer.tokenize(text=paragraph): for word in self._word_tokenizer.tokenize(text=sentence): diff --git a/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/log.py b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/log.py new file mode 100644 index 000000000..2ffcf32bb --- /dev/null +++ b/livekit-plugins/livekit-plugins-rag/livekit/plugins/rag/log.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger("livekit.plugins.rag") From 297db92d1732b0b89b093336725d945cc0b6872b Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Fri, 2 Aug 2024 20:26:07 -0700 Subject: [PATCH 23/24] elevenlabs: error on non-PCM data --- .changeset/violet-students-shout.md | 5 +++ .../livekit/plugins/elevenlabs/tts.py | 42 ++++++++++++++----- 2 files changed, 36 insertions(+), 11 deletions(-) create mode 100644 .changeset/violet-students-shout.md diff --git a/.changeset/violet-students-shout.md b/.changeset/violet-students-shout.md new file mode 100644 index 000000000..0ac116426 --- /dev/null +++ b/.changeset/violet-students-shout.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-elevenlabs": patch +--- + +gracefully error on non-PCM data diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py index 701c5db7c..dd509c11a 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py @@ -156,6 +156,8 @@ def __init__( ) -> None: super().__init__() self._text, self._opts, self._session = text, opts, session + if _encoding_from_format(self._opts.encoding) == "mp3": + self._mp3_decoder = utils.codecs.Mp3StreamDecoder() @utils.log_exceptions(logger=logger) async def _main_task(self) -> None: @@ -181,21 +183,39 @@ async def _main_task(self) -> None: headers={AUTHORIZATION_HEADER: self._opts.api_key}, json=data, ) as resp: - async for bytes_data, _ in resp.content.iter_chunks(): - for frame in bstream.write(bytes_data): + if not resp.content_type.startswith("audio/"): + content = await resp.text() + logger.error("11labs returned non-audio data: %s", content) + return + encoding = _encoding_from_format(self._opts.encoding) + if encoding == "mp3": + async for bytes_data, _ in resp.content.iter_chunks(): + for frame in self._mp3_decoder.decode_chunk(bytes_data): + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + segment_id=segment_id, + frame=frame, + ) + ) + else: + async for bytes_data, _ in resp.content.iter_chunks(): + for frame in bstream.write(bytes_data): + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + segment_id=segment_id, + frame=frame, + ) + ) + + for frame in bstream.flush(): self._event_ch.send_nowait( tts.SynthesizedAudio( request_id=request_id, segment_id=segment_id, frame=frame ) ) - for frame in bstream.flush(): - self._event_ch.send_nowait( - tts.SynthesizedAudio( - request_id=request_id, segment_id=segment_id, frame=frame - ) - ) - class SynthesizeStream(tts.SynthesizeStream): """Streamed API using websockets""" @@ -388,11 +408,11 @@ def _synthesize_url(opts: _TTSOptions) -> str: base_url = opts.base_url voice_id = opts.voice.id model_id = opts.model_id - sample_rate = _sample_rate_from_format(opts.encoding) + output_format = opts.encoding latency = opts.streaming_latency return ( f"{base_url}/text-to-speech/{voice_id}/stream?" - f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}" + f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}" ) From 8a4aed175480f188de00e0674a7f1a5b7ef75684 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Sat, 3 Aug 2024 12:23:50 -0700 Subject: [PATCH 24/24] 0.8 migration guide (#572) --- 0.8-migration-guide.md | 186 +++++++++++++++++++++++++++++++++++++++++ README.md | 13 ++- 2 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 0.8-migration-guide.md diff --git a/0.8-migration-guide.md b/0.8-migration-guide.md new file mode 100644 index 000000000..f00b70a06 --- /dev/null +++ b/0.8-migration-guide.md @@ -0,0 +1,186 @@ +# Migrating to 0.8.x + +v0.8 is a major release of the framework, featuring significant reliability improvements to VoiceAssistant. This update includes a few breaking API changes that will impact the way you build your agents. We strive to minimize breaking changes and will stabilize the API as we approach version 1.0. + +## Job and Worker API + +### Specifying your entrypoint function + +`entrypoint_fnc` is now a parameter in WorkerOptions. Previously, you were required to explicitly accept the job. + +### Namespace had been removed + +We've removed the namespace option in order to simplify the registration process. In future versions, it'll be possible to provide an explicit `agent_name` to spin up multiple kinds of agents for each room. + +### Connecting to room is explicit + +You now need to call ctx.connect() to initiate the connection to the room. This allows for pre-connect setup (such as callback registrations) to avoid race conditions. + +### Example + +The above changes are reflected in the following minimal example: + +```python +from livekit.agents import JobContext, JobRequest, WorkerOptions, cli + +async def job_entrypoint(ctx: JobContext): + await ctx.connect() + # your logic here + ... + +if __name__ == "__main__": + cli.run_app( + WorkerOptions(entrypoint_fnc=job_entrypoint) + ) +``` + +## VoiceAssistant + +VoiceAssistant API remains mostly unchanged, despite significant improvements to functionality and internals. However, there have been changes to the configuration. + +### Initialization args + +- Removed + - base_volume + - debug + - sentence_tokenizer, word_tokenizer, hyphenate_word +- Changed + - transcription related options are grouped within `transcription` param + +```python +class VoiceAssistant(utils.EventEmitter[EventTypes]): + def __init__( + self, + *, + vad: vad.VAD, + stt: stt.STT, + llm: LLM, + tts: tts.TTS, + chat_ctx: ChatContext | None = None, + fnc_ctx: FunctionContext | None = None, + allow_interruptions: bool = True, + interrupt_speech_duration: float = 0.6, + interrupt_min_words: int = 0, + preemptive_synthesis: bool = True, + transcription: AssistantTranscriptionOptions = AssistantTranscriptionOptions(), + will_synthesize_assistant_reply: WillSynthesizeAssistantReply = _default_will_synthesize_assistant_reply, + plotting: bool = False, + loop: asyncio.AbstractEventLoop | None = None, + ) -> None: + ... +``` + +## LLM + +The LLM class has been restructured to enhance ergonomics and improve the function calling support. + +### Function/tool calling + +Function calling has gotten a complete overhaul in v0.8.0. The primary breaking change is that function calls are now NOT automatically invoked when iterating the LLM stream. `LLMStream.execute_functions` needs to be called instead. (VoiceAssistant handles this automatically) + +### LLM.chat is no longer an async method + +Previously, LLM.chat() was an async method that returned an LLMStream (which itself was an AsyncIterable). + +We found it easier and less-confusing for LLM.chat() to be synchronous, while still returning the same AsyncIterable LLMStream. + +### LLM.chat `history` has been renamed to `chat_ctx` + +In order to improve consistency and reduce confusion. + +```python +chat_ctx = llm.ChatContext() +chat_ctx.append(role="user", text="user message") +stream = llm_plugin.chat(chat_ctx=chat_ctx) +``` + +## STT + +### SpeechStream.flush + +Previously, to communicate to a STT provider that you have sent enough input to generate a response - you could push_frame(None) to coax the TTS into synthesizing a response. + +In v0.8.0 that API has been removed and replaced with flush() + +### SpeechStream.end_input + +`end_input` signals to the STT provider that the input is complete and no additional input will follow. Previously, this was done using aclose(wait=True). + +### SpeechStream.aclose + +The `wait` arg of aclose has been removed in favor of SpeechStream.end_input (see above). Now, if you call `TTS.aclose()` without first calling STT.end_input, the behavior will be that the request is cancelled. + +```python +stt_stream = my_stt_instance.stream() +async for ev in audio_stream: + stt_stream.push_frame(ev.frame) + # optionally flush when enough frames have been pushed + stt_stream.flush() + +stt_stream.end_input() +await stt_stream.aclose() +``` + +## TTS + +### SynthesizedAudio changed and SynthesisEvent removed + +SynthesizedAudio dataclass had gone through a major change + +```python +# New SynthesizedAudio dataclass +@dataclass +class SynthesizedAudio: + request_id: str + """Request ID (one segment could be made up of multiple requests)""" + segment_id: str + """Segment ID, each segment is separated by a flush""" + frame: rtc.AudioFrame + """Synthesized audio frame""" + delta_text: str = "" + """Current segment of the synthesized audio""" + +#Old SynthesizedAudio dataclass +@dataclass +class SynthesizedAudio: + text: str + data: rtc.AudioFrame +``` + +The SynthesisEvent has been removed entirely. All occurrences of it have been replaced with SynthesizedAudio + +### SynthesizeStream.flush + +Similar to the STT changes, this coaxes the TTS provider into generating a response. The SynthesizedAudio response will have a new segment_id after calls to flush(). + +### SynthesizeStream.end_input + +Similar to the STT changes, this replaces aclose(wait=True). + +### SynthesizeStream.aclose + +Similar to the STT changes, the wait arg has been removed. + +```python +tts_stream = my_tts_instance.stream() +tts_stream.push_text("This is the first sentence") +tts_stream.flush() +tts_stream.push_text("This is the second sentence") +tts_stream.end_input() +await tts_stream.aclose() +``` + +## VAD + +The same changes made to STT and TTS have also been made to VAD + +```python +vad_stream = my_vad_instance.stream() +async for ev in audio_stream: + vad_stream.push_frame(ev.frame) + # optionally flush when enough frames have been pushed + vad_stream.flush() + +vad_stream.end_input() +await vad_stream.aclose() +``` diff --git a/README.md b/README.md index a497f63c4..dd08f9d8d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ audio, video, and data streams. The framework includes plugins for common workflows, such as voice activity detection and speech-to-text. -Agents integrates seamlessly with [LiveKit server](https://github.com/livekit/livekit), offloading job queuing and scheduling responsibilities to it. This eliminates the need for additional queuing infrastructure. Agent code developed on your local machine can scale to support thousands of concurrent sessions when deployed to a server in production. +Agents integrates seamlessly with Cloud or self-hosted [LiveKit](https://livekit.io/) server, offloading job queuing and scheduling responsibilities to it. This eliminates the need for additional queuing infrastructure. Agent code developed on your local machine can scale to support thousands of concurrent sessions when deployed to a server in production. > This SDK is currently in Developer Preview. During this period, you may encounter bugs and the APIs may change. > @@ -33,6 +33,9 @@ Agents integrates seamlessly with [LiveKit server](https://github.com/livekit/li - [Working with plugins](https://docs.livekit.io/agents/plugins) - [Deploying agents](https://docs.livekit.io/agents/deployment) +> [!NOTE] +> There are breaking API changes between versions 0.7.x and 0.8.x. Please refer to the [0.8 migration guide](0.8-migration-guide.md) for a detailed overview of the changes. + ## Examples - [Voice assistant](https://github.com/livekit/agents/tree/main/examples/voice-assistant): A voice assistant with STT, LLM, and TTS. [Demo](https://kitt.livekit.io) @@ -81,7 +84,7 @@ The framework exposes a CLI interface to run your agent. To get started, you'll - LIVEKIT_API_KEY - LIVEKIT_API_SECRET -### Running the worker +### Starting the worker This will start the worker and wait for users to connect to your LiveKit server: @@ -89,6 +92,12 @@ This will start the worker and wait for users to connect to your LiveKit server: python my_agent.py start ``` +To run the worker in dev-mode (with hot code reloading), you can use the dev command: + +```bash +python my_agent.py dev +``` + ### Using playground for your agent UI To ease the process of building and testing an agent, we've developed a versatile web frontend called "playground". You can use or modify this app to suit your specific requirements. It can also serve as a starting point for a completely custom agent application.