diff --git a/RealtimeTTS/engines/base_engine.py b/RealtimeTTS/engines/base_engine.py
index 15eca04..72e249b 100644
--- a/RealtimeTTS/engines/base_engine.py
+++ b/RealtimeTTS/engines/base_engine.py
@@ -53,7 +53,7 @@ def get_stream_info(self):
             "The get_stream_info method must be implemented by the derived class."
         )
 
-    def synthesize(self, text: str) -> bool:
+    def synthesize(self, text: str, *args, **kwargs) -> bool:
         """
         Synthesizes text to audio stream.
 
diff --git a/RealtimeTTS/engines/coqui_engine.py b/RealtimeTTS/engines/coqui_engine.py
index e07dc53..3a2017c 100644
--- a/RealtimeTTS/engines/coqui_engine.py
+++ b/RealtimeTTS/engines/coqui_engine.py
@@ -1,3 +1,4 @@
+import queue
 from .base_engine import BaseEngine
 import torch.multiprocessing as mp
 from threading import Lock, Thread
@@ -212,8 +213,10 @@ def __init__(
         # Start the worker process
         try:
             # Only set the start method if it hasn't been set already
-            # Check the current platform and set the start method
-            if sys.platform.startswith('linux') or sys.platform == 'darwin':  # For Linux or macOS
+            # Check the current platform and set the start method
+            if (
+                sys.platform.startswith("linux") or sys.platform == "darwin"
+            ):  # For Linux or macOS
                 mp.set_start_method("spawn")
             elif mp.get_start_method(allow_none=True) is None:
                 mp.set_start_method("spawn")
@@ -737,8 +740,15 @@ def get_user_data_dir(appname):
                     # wait only if we are faster than realtime, meaning
                     # that chunk_production_seconds is smaller than generated_audio_seconds
                     if load_balancing:
-                        if chunk_production_seconds < (generated_audio_seconds + load_balancing_buffer_length):
-                            waiting_time = generated_audio_seconds - chunk_production_seconds - load_balancing_cut_off
+                        if chunk_production_seconds < (
+                            generated_audio_seconds
+                            + load_balancing_buffer_length
+                        ):
+                            waiting_time = (
+                                generated_audio_seconds
+                                - chunk_production_seconds
+                                - load_balancing_cut_off
+                            )
                             if waiting_time > 0:
                                 print(f"Waiting for {waiting_time} seconds")
                                 time.sleep(waiting_time)
@@ -759,7 +769,6 @@ def get_user_data_dir(appname):
                 print(f"Realtime Factor: {realtime_factor}")
                 print(f"Raw Inference Factor: {raw_inference_factor}")
 
-
             # Send silent audio
 
             sample_rate = config.audio.sample_rate
@@ -796,7 +805,7 @@ def get_user_data_dir(appname):
             print(f"Error: {e}")
             conn.send(("error", str(e)))
 
-    
+
     sys.stdout = sys.__stdout__
     sys.stderr = sys.__stderr__
 
@@ -908,7 +917,7 @@ def _prepare_text_for_synthesis(self, text: str):
                 text = text[:-2]
             elif len(text) > 3 and text[-2] in ["!", "?", ","]:
                 text = text[:-2] + " " + text[-2]
-            
+
         except Exception as e:
             logging.warning(
                 f'Error fixing sentence end punctuation: {e}, Text: "{text}"'
@@ -920,7 +929,7 @@ def _prepare_text_for_synthesis(self, text: str):
 
         return text
 
-    def synthesize(self, text: str) -> bool:
+    def synthesize(self, text: str, audio_queue: queue.Queue) -> bool:
         """
         Synthesizes text to audio stream.
 
@@ -947,7 +956,7 @@ def synthesize(self, text: str) -> bool:
                 logging.error(f"Error: {result}")
                 return False
 
-            self.queue.put(result)
+            audio_queue.put(result)
             status, result = self.parent_synthesize_pipe.recv()
 
         return True
diff --git a/RealtimeTTS/text_to_stream.py b/RealtimeTTS/text_to_stream.py
index 8f532cf..31cce8a 100644
--- a/RealtimeTTS/text_to_stream.py
+++ b/RealtimeTTS/text_to_stream.py
@@ -100,6 +100,8 @@ def __init__(
             # Handle the case where engine is a single BaseEngine instance
             self.engines = [engine]
 
+        self.audio_queue = queue.Queue()
+
         self.load_engine(self.engines[self.engine_index])
 
     def load_engine(self, engine: BaseEngine):
@@ -127,7 +129,7 @@ def load_engine(self, engine: BaseEngine):
             )
 
         self.player = StreamPlayer(
-            self.engine.queue, config, on_playback_start=self._on_audio_stream_start
+            self.audio_queue, config, on_playback_start=self._on_audio_stream_start
         )
 
         logging.info(f"loaded engine {self.engine.engine_name}")
@@ -319,7 +321,6 @@ def play(
                     self.engine.synthesize(self.char_iter)
 
         finally:
-
             try:
                 if self.player:
                     self.player.stop()
@@ -395,7 +396,9 @@ def synthesize_worker():
                     synthesis_successful = False
 
                     if log_synthesized_text:
-                        print(f"\033[96m\033[1m⚡ synthesizing\033[0m \033[37m→ \033[2m'\033[22m{sentence}\033[2m'\033[0m")
+                        print(
+                            f"\033[96m\033[1m⚡ synthesizing\033[0m \033[37m→ \033[2m'\033[22m{sentence}\033[2m'\033[0m"
+                        )
 
                     while not synthesis_successful:
                         try:
@@ -404,7 +407,9 @@ def synthesize_worker():
 
                             if before_sentence_synthesized:
                                 before_sentence_synthesized(sentence)
-                            success = self.engine.synthesize(sentence)
+                            success = self.engine.synthesize(
+                                sentence, audio_queue=self.audio_queue
+                            )
                             if success:
                                 if on_sentence_synthesized:
                                     on_sentence_synthesized(sentence)
@@ -486,10 +491,11 @@ def synthesize_worker():
                 self.wf.close()
                 self.wf = None
 
-            if (len(self.char_iter.items) > 0
+            if (
+                len(self.char_iter.items) > 0
                 and self.char_iter.iterated_text == ""
-                and not self.char_iter.immediate_stop.is_set()):
-
+                and not self.char_iter.immediate_stop.is_set()
+            ):
                 # new text was feeded while playing audio but after the last character was processed
                 # we need to start another play() call (!recursively!)
                 self.play(
diff --git a/example_fast_api/async_server.py b/example_fast_api/async_server.py
index 494aef2..d49c574 100644
--- a/example_fast_api/async_server.py
+++ b/example_fast_api/async_server.py
@@ -40,12 +40,12 @@
     "openai",
     "elevenlabs",
     "system",
-    # "coqui",  #multiple queries are not supported on coqui engine right now, comment coqui out for tests where you need server start often
+    "coqui",  # multiple queries are not supported on coqui engine right now, comment coqui out for tests where you need server start often
 ]
 
 # change start engine by moving engine name
 # to the first position in SUPPORTED_ENGINES
-START_ENGINE = SUPPORTED_ENGINES[0]
+START_ENGINE = SUPPORTED_ENGINES[-1]
 
 BROWSER_IDENTIFIERS = [
     "mozilla",
diff --git a/example_fast_api/client_multi_queries_test.py b/example_fast_api/client_multi_queries_test.py
new file mode 100644
index 0000000..5175852
--- /dev/null
+++ b/example_fast_api/client_multi_queries_test.py
@@ -0,0 +1,149 @@
+import uuid
+import wave
+import requests
+import pyaudio
+import time
+import threading
+import argparse
+from queue import Queue
+
+# Argument parser setup
+parser = argparse.ArgumentParser(description="Run the TTS client.")
+parser.add_argument(
+    "-p",
+    "--port",
+    type=int,
+    default=8000,
+    help="Port of the TTS server (default: 8000)",
+)
+parser.add_argument(
+    "-t",
+    "--text",
+    type=str,
+    default="Hello! This is a default text to speech demo text!",
+    help="Text to convert to speech (default: 'Hello! This is a default text to speech demo text!')",
+)
+parser.add_argument(
+    "-w", "--write", action="store_true", help="Save output to a WAV file"
+)
+args = parser.parse_args()
+
+port = args.port
+text_to_tts = args.text
+write_to_file = args.write
+
+# Configuration
+
+SERVER_URL = f"http://127.0.0.1:{port}/tts"
+AUDIO_FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000  # coqui (24000), azure (16000), openai (22050), system (22050)
+
+# Initialize PyAudio
+pyaudio_instance = pyaudio.PyAudio()
+
+# Queue for chunk storage
+chunk_queue = Queue()
+
+
+# Thread function for playing audio
+def play_audio():
+    global start_time
+    buffer = b""
+    played_out = False
+    got_first_chunk = False
+
+    frame_size = pyaudio_instance.get_sample_size(AUDIO_FORMAT) * CHANNELS
+    min_buffer_size = 1024 * 6  # Adjust as needed
+
+    # Initial buffering
+    while len(buffer) < min_buffer_size:
+        chunk = chunk_queue.get()
+        if chunk is None:
+            break
+        if not got_first_chunk:
+            got_first_chunk = True
+            time_to_first_token = time.time() - start_time
+            print(f"Time to first token: {time_to_first_token}")
+        buffer += chunk
+
+    # Now start playback
+    while True:
+        # Write data if buffer has enough frames
+        if len(buffer) >= frame_size:
+            num_frames = len(buffer) // frame_size
+            bytes_to_write = num_frames * frame_size
+            if not played_out:
+                played_out = True
+                time_to_first_token = time.time() - start_time
+                # print(f"Time to first playout: {time_to_first_token}")
+            # stream.write(buffer[:bytes_to_write])
+            buffer = buffer[bytes_to_write:]
+        else:
+            # Get more data
+            chunk = chunk_queue.get()
+            if chunk is None:
+                # Write any remaining data
+                if len(buffer) > 0:
+                    # Truncate buffer to multiple of frame size if necessary
+                    if len(buffer) % frame_size != 0:
+                        buffer = buffer[: -(len(buffer) % frame_size)]
+
+                    if not played_out:
+                        played_out = True
+                        time_to_first_token = time.time() - start_time
+                        # print(f"Time to first playout: {time_to_first_token}")
+                    # stream.write(buffer)
+                break
+            buffer += chunk
+
+
+# Function to request text-to-speech conversion and retrieve chunks
+def request_tts(text):
+    # Optionally set up WAV file
+    if write_to_file:
+        output_wav_file = f"output_audio_{uuid.uuid4()}.wav"
+        wav_file = wave.open(output_wav_file, "wb")
+        wav_file.setnchannels(CHANNELS)
+        wav_file.setsampwidth(pyaudio_instance.get_sample_size(AUDIO_FORMAT))
+        wav_file.setframerate(RATE)
+
+    global start_time
+    start_time = time.time()
+    try:
+        response = requests.get(
+            SERVER_URL, params={"text": text}, stream=True, timeout=10
+        )
+        response.raise_for_status()  # Raises an HTTPError if the response status is 4xx/5xx
+
+        # Read data as it becomes available
+        for chunk in response.iter_content(chunk_size=None):
+            if chunk and write_to_file:
+                wav_file.writeframes(chunk)
+            chunk_queue.put(chunk)
+
+        # Signal the end of the stream
+        chunk_queue.put(None)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error occurred: {e}")
+        chunk_queue.put(None)  # Ensure that playback thread exits gracefully
+
+
+# Start audio playback thread
+playback_thread = threading.Thread(target=play_audio)
+playback_thread.start()
+
+request1_thread = threading.Thread(target=request_tts, args=("One, " + text_to_tts,))
+request2_thread = threading.Thread(target=request_tts, args=("Two, " + text_to_tts,))
+
+# Start both request threads concurrently, then wait for all threads to finish
+try:
+    request1_thread.start()
+    request2_thread.start()
+
+finally:
+    request1_thread.join()
+    request2_thread.join()
+    playback_thread.join()
+    pyaudio_instance.terminate()
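
Reviewer note: below is a minimal sketch of the reworked synthesize() contract this patch introduces. The MiniEngine class and the byte payload are hypothetical stand-ins; only the per-call audio_queue parameter and the stream-owned queue.Queue mirror the actual changes to BaseEngine, CoquiEngine, and TextToAudioStream.

    import queue

    class MiniEngine:
        # Hypothetical stand-in for an engine after this patch.
        def synthesize(self, text: str, audio_queue: queue.Queue) -> bool:
            # Engines now write chunks into the queue handed to them per
            # call, instead of into an engine-owned self.queue.
            audio_queue.put(f"<pcm chunk for {text!r}>".encode())
            return True

    # TextToAudioStream now owns the queue and hands it to both the
    # StreamPlayer (consumer) and every synthesize() call (producer).
    audio_queue = queue.Queue()
    MiniEngine().synthesize("Hello world.", audio_queue=audio_queue)
    print(audio_queue.get())  # the chunk a StreamPlayer would consume

The new example_fast_api/client_multi_queries_test.py exercises this path by firing two requests at once; for example, python client_multi_queries_test.py -p 8000 -w runs both queries against the server on port 8000 and saves each response to its own uuid-named WAV file.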