StreamPlayer: use the TextToAudioStream queue instead of the engine's one #232

Open
wants to merge 2 commits into master
Changes from 1 commit
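Summary of the change shown below: ownership of the synthesized-audio queue moves from the engine (self.engine.queue) into TextToAudioStream itself (self.audio_queue). StreamPlayer is constructed on that stream-owned queue, and every engine.synthesize() call now receives the queue explicitly, so the player's audio source no longer depends on which engine happens to be loaded. A minimal toy sketch of the new wiring; ToyEngine and ToyStream are illustrative stand-ins, not the real RealtimeTTS classes:

import queue


class ToyEngine:
    """Stand-in engine: writes chunks into whatever queue the caller provides."""

    def synthesize(self, text: str, audio_queue: queue.Queue) -> bool:
        audio_queue.put(f"<pcm for {text!r}>")  # engine fills the caller's queue
        return True


class ToyStream:
    """Stand-in stream: owns one queue shared by the player and all engines."""

    def __init__(self, engines):
        self.engines = engines
        self.audio_queue = queue.Queue()  # survives engine switches

    def play(self, text: str):
        self.engines[0].synthesize(text, audio_queue=self.audio_queue)
        print(self.audio_queue.get())  # the "player" reads the shared queue


ToyStream([ToyEngine()]).play("Hello world")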
2 changes: 1 addition & 1 deletion RealtimeTTS/engines/base_engine.py
@@ -53,7 +53,7 @@ def get_stream_info(self):
"The get_stream_info method must be implemented by the derived class."
)

- def synthesize(self, text: str) -> bool:
+ def synthesize(self, text: str, *args, **kwargs) -> bool:
"""
Synthesizes text to audio stream.

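Widening the base signature to *args, **kwargs lets the stream pass engine-specific extras (here, the target audio queue) without breaking engines that ignore them. A sketch of how a derived engine could pick the queue out of **kwargs; DummyEngine is hypothetical and not part of the repository:

import queue


class DummyEngine:
    """Hypothetical engine illustrating the widened synthesize() signature."""

    def synthesize(self, text: str, *args, **kwargs) -> bool:
        audio_queue = kwargs.get("audio_queue")
        if audio_queue is None:
            return False  # no destination queue supplied
        audio_queue.put(b"\x00" * 3200)  # pretend this is PCM for `text`
        return True


q = queue.Queue()
DummyEngine().synthesize("hello", audio_queue=q)
print(q.qsize())  # -> 1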
27 changes: 18 additions & 9 deletions RealtimeTTS/engines/coqui_engine.py
@@ -1,3 +1,4 @@
+ import queue
from .base_engine import BaseEngine
import torch.multiprocessing as mp
from threading import Lock, Thread
@@ -212,8 +213,10 @@ def __init__(
# Start the worker process
try:
# Only set the start method if it hasn't been set already
- # Check the current platform and set the start method
- if sys.platform.startswith('linux') or sys.platform == 'darwin': # For Linux or macOS
+ # Check the current platform and set the start method
+ if (
+     sys.platform.startswith("linux") or sys.platform == "darwin"
+ ): # For Linux or macOS
mp.set_start_method("spawn")
elif mp.get_start_method(allow_none=True) is None:
mp.set_start_method("spawn")
@@ -737,8 +740,15 @@ def get_user_data_dir(appname):
# wait only if we are faster than realtime, meaning
# that chunk_production_seconds is smaller than generated_audio_seconds
if load_balancing:
- if chunk_production_seconds < (generated_audio_seconds + load_balancing_buffer_length):
-     waiting_time = generated_audio_seconds - chunk_production_seconds - load_balancing_cut_off
+ if chunk_production_seconds < (
+     generated_audio_seconds
+     + load_balancing_buffer_length
+ ):
+     waiting_time = (
+         generated_audio_seconds
+         - chunk_production_seconds
+         - load_balancing_cut_off
+     )
if waiting_time > 0:
print(f"Waiting for {waiting_time} seconds")
time.sleep(waiting_time)
@@ -759,7 +769,6 @@ def get_user_data_dir(appname):
print(f"Realtime Factor: {realtime_factor}")
print(f"Raw Inference Factor: {raw_inference_factor}")


# Send silent audio
sample_rate = config.audio.sample_rate

@@ -796,7 +805,7 @@ def get_user_data_dir(appname):
print(f"Error: {e}")

conn.send(("error", str(e)))

sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__

@@ -908,7 +917,7 @@ def _prepare_text_for_synthesis(self, text: str):
text = text[:-2]
elif len(text) > 3 and text[-2] in ["!", "?", ","]:
text = text[:-2] + " " + text[-2]

except Exception as e:
logging.warning(
f'Error fixing sentence end punctuation: {e}, Text: "{text}"'
@@ -920,7 +929,7 @@ def _prepare_text_for_synthesis(self, text: str):

return text

- def synthesize(self, text: str) -> bool:
+ def synthesize(self, text: str, audio_queue: queue.Queue) -> bool:
"""
Synthesizes text to audio stream.

@@ -947,7 +956,7 @@ def synthesize(self, text: str) -> bool:
logging.error(f"Error: {result}")
return False

- self.queue.put(result)
+ audio_queue.put(result)
status, result = self.parent_synthesize_pipe.recv()

return True
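With this patch the caller owns the chunk queue and passes it in, instead of reading results back from CoquiEngine's internal queue. A sketch of direct usage under that assumption (it requires a working Coqui TTS installation; the default model is loaded on first use, and the package-level CoquiEngine export is assumed):

import queue

from RealtimeTTS import CoquiEngine

engine = CoquiEngine()  # loads the default XTTS model
audio_chunks = queue.Queue()

# The worker process now puts each synthesized chunk into the queue we pass in.
if engine.synthesize("Hello from a caller-owned queue.", audio_queue=audio_chunks):
    while not audio_chunks.empty():
        chunk = audio_chunks.get()  # raw audio bytes
        # ... hand the chunk to a player or append it to a wav file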
20 changes: 13 additions & 7 deletions RealtimeTTS/text_to_stream.py
@@ -100,6 +100,8 @@ def __init__(
# Handle the case where engine is a single BaseEngine instance
self.engines = [engine]

+ self.audio_queue = queue.Queue()
+
self.load_engine(self.engines[self.engine_index])

def load_engine(self, engine: BaseEngine):
@@ -127,7 +129,7 @@ def load_engine(self, engine: BaseEngine):
)

self.player = StreamPlayer(
- self.engine.queue, config, on_playback_start=self._on_audio_stream_start
+ self.audio_queue, config, on_playback_start=self._on_audio_stream_start
)

logging.info(f"loaded engine {self.engine.engine_name}")
@@ -319,7 +321,6 @@ def play(
self.engine.synthesize(self.char_iter)

finally:
-
try:
if self.player:
self.player.stop()
@@ -395,7 +396,9 @@ def synthesize_worker():

synthesis_successful = False
if log_synthesized_text:
- print(f"\033[96m\033[1m⚡ synthesizing\033[0m \033[37m→ \033[2m'\033[22m{sentence}\033[2m'\033[0m")
+ print(
+     f"\033[96m\033[1m⚡ synthesizing\033[0m \033[37m→ \033[2m'\033[22m{sentence}\033[2m'\033[0m"
+ )

while not synthesis_successful:
try:
@@ -404,7 +407,9 @@

if before_sentence_synthesized:
before_sentence_synthesized(sentence)
- success = self.engine.synthesize(sentence)
+ success = self.engine.synthesize(
+     sentence, audio_queue=self.audio_queue
+ )
if success:
if on_sentence_synthesized:
on_sentence_synthesized(sentence)
@@ -486,10 +491,11 @@ def synthesize_worker():
self.wf.close()
self.wf = None

- if (len(self.char_iter.items) > 0
+ if (
+ len(self.char_iter.items) > 0
and self.char_iter.iterated_text == ""
- and not self.char_iter.immediate_stop.is_set()):
-
+ and not self.char_iter.immediate_stop.is_set()
+ ):
# new text was feeded while playing audio but after the last character was processed
# we need to start another play() call (!recursively!)
self.play(
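Taken together, TextToAudioStream now creates one queue.Queue() in __init__, builds the StreamPlayer on it in load_engine(), and passes it to every engine.synthesize() call. A sketch of the user-visible effect, assuming the package-level exports, installed SystemEngine/CoquiEngine backends, and the list form of the constructor implied by the self.engines handling above; switching engines keeps the player attached to the same stream-owned queue:

from RealtimeTTS import TextToAudioStream, SystemEngine, CoquiEngine

stream = TextToAudioStream([SystemEngine(), CoquiEngine()])

stream.feed("Spoken with the system voice. ").play()

# Swap engines mid-session: the StreamPlayer keeps reading stream.audio_queue,
# so no per-engine queue has to be re-wired.
stream.load_engine(stream.engines[1])
stream.feed("Spoken with Coqui after the switch. ").play()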