Fix/sync fork neil/1 #172

Closed · wants to merge 6 commits
Changes from all commits
@@ -29,7 +29,7 @@

 @dataclass
 class Voice:
-    id: str
+    voice_id: str
     name: str
     category: str
     settings: Optional["VoiceSettings"] = None
@@ -44,7 +44,7 @@ class VoiceSettings:


 DEFAULT_VOICE = Voice(
-    id="EXAVITQu4vr4xnSDxMaL",
+    voice_id="EXAVITQu4vr4xnSDxMaL",
     name="Bella",
     category="premade",
     settings=VoiceSettings(
@@ -108,7 +108,7 @@ async def synthesize(
     ) -> tts.SynthesizedAudio:
         voice = self._config.voice
         async with self._session.post(
-            f"{self._config.base_url}/text-to-speech/{voice.id}?output_format=pcm_44100",
+            f"{self._config.base_url}/text-to-speech/{voice.voice_id}?output_format=mp3_44100_128",
             headers={AUTHORIZATION_HEADER: self._config.api_key},
             json=dict(
                 text=text,
@@ -159,9 +159,9 @@ def log_exception(task: asyncio.Task) -> None:

     def _stream_url(self) -> str:
         base_url = self._config.base_url
-        voice_id = self._config.voice.id
+        voice_id = self._config.voice.voice_id
         model_id = self._config.model_id
-        return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{self._config.sample_rate}&optimize_streaming_latency={self._config.latency}"
+        return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=mp3_44100_128&optimize_streaming_latency={self._config.latency}"

     def push_text(self, token: str) -> None:
         if self._closed:
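For context on the change above: with placeholder config values assumed for illustration (base_url, model_id, and latency are not taken from this diff; only the voice_id comes from DEFAULT_VOICE), the new _stream_url would produce a URL along these lines. A minimal sketch, not part of the PR:

# Sketch only: placeholder values assumed for illustration.
base_url = "https://api.elevenlabs.io/v1"   # assumed API base
voice_id = "EXAVITQu4vr4xnSDxMaL"           # DEFAULT_VOICE above
model_id = "eleven_monolingual_v1"          # assumed model_id
latency = 2                                 # assumed optimize_streaming_latency
url = (
    f"{base_url}/text-to-speech/{voice_id}/stream-input"
    f"?model_id={model_id}&output_format=mp3_44100_128"
    f"&optimize_streaming_latency={latency}"
)
# -> https://api.elevenlabs.io/v1/text-to-speech/EXAVITQu4vr4xnSDxMaL/stream-input
#    ?model_id=eleven_monolingual_v1&output_format=mp3_44100_128&optimize_streaming_latency=2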
@@ -184,6 +184,7 @@ async def _run(self, max_retry: int) -> None:
         retry_count = 0
         listen_task: Optional[asyncio.Task] = None
         ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        retry_text_queue: asyncio.Queue[str] = asyncio.Queue()
         while True:
             try:
                 ws = await self._try_connect()
@@ -194,7 +195,13 @@
                 # forward queued text to 11labs
                 started = False
                 while not ws.closed:
-                    text = await self._queue.get()
+                    text = None
+                    if not retry_text_queue.empty():
+                        text = await retry_text_queue.get()
+                        retry_text_queue.task_done()
+                    else:
+                        text = await self._queue.get()
+
                     if not started:
                         self._event_queue.put_nowait(
                             tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
@@ -204,7 +211,19 @@
                         text=text,
                         try_trigger_generation=True,
                     )
-                    await ws.send_str(json.dumps(text_packet))
+
+                    # This case can happen in normal operation because 11labs will not
+                    # keep connections open indefinitely if we are not sending data.
+                    try:
+                        await ws.send_str(json.dumps(text_packet))
+                    except Exception:
+                        await retry_text_queue.put(text)
+                        break
+
+                    # We call self._queue.task_done() even if we are retrying the text because
+                    # all text has gone through self._queue. An exception may have short-circuited
+                    # out of the loop so task_done() will not have already been called on text that
+                    # is being retried.
                     self._queue.task_done()
                     if text == STREAM_EOS:
                         await listen_task
@@ -307,10 +326,10 @@ def dict_to_voices_list(data: dict) -> List[Voice]:
     for voice in data["voices"]:
         voices.append(
             Voice(
-                id=voice["voice_id"],
+                voice_id=voice["voice_id"],
                 name=voice["name"],
                 category=voice["category"],
                 settings=None,
             )
         )
     return voices
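Taken together, the retry_text_queue changes implement a small reconnect-and-replay pattern: when a send fails (11labs closes idle connections), the text is parked in a local queue, the outer loop reconnects, and the parked text is drained before anything new is pulled from the main queue; task_done() on the main queue is only called once the text has actually been sent. Below is a minimal standalone sketch of the same idea, not the plugin's actual API: forward_with_retry, source, and connect are names invented here, and the empty string stands in for the STREAM_EOS sentinel.

import asyncio


async def forward_with_retry(source: "asyncio.Queue[str]", connect) -> None:
    # `connect` is a hypothetical coroutine returning a websocket-like object
    # with a `closed` attribute and an async `send_str(text)` method.
    retry_queue: asyncio.Queue[str] = asyncio.Queue()
    while True:
        ws = await connect()
        while not ws.closed:
            # Replay text that failed on the previous connection before pulling new text.
            if not retry_queue.empty():
                text = retry_queue.get_nowait()
            else:
                text = await source.get()
            try:
                await ws.send_str(text)
            except Exception:
                # Send failed (e.g. the server dropped an idle connection):
                # keep the text and break out to reconnect.
                await retry_queue.put(text)
                break
            # Every item originally came from `source`, so mark it done only
            # after a successful send; a failed send breaks out before this line.
            source.task_done()
            if text == "":  # stand-in for the STREAM_EOS sentinel
                return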