From c35f9e54b93e6df318f0a01cb797a4e1d7be54d3 Mon Sep 17 00:00:00 2001
From: Ryan Tremblay
Date: Thu, 1 Jun 2023 11:12:09 -0700
Subject: [PATCH] Turn Based Coqui and Agent Improvements (#172)

* Add async synthesize, xtts, and prompt to coqui TB

* add speechrecognition and aiohttp dependencies

* add optional memory arg to turn-based ChatGPTAgent

* fix mypy issue

* pr feedback

* fix py3.8 typing issue

* another py3.8 fix
---
 poetry.lock                               |  20 ++-
 pyproject.toml                            |   2 +
 vocode/turn_based/agent/chat_gpt_agent.py |   5 +-
 .../synthesizer/base_synthesizer.py       |   2 +
 .../synthesizer/coqui_synthesizer.py      | 154 ++++++++++++++++--
 5 files changed, 170 insertions(+), 13 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1bf6d04ff..80f19fe9e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2491,6 +2491,24 @@ CFFI = ">=1.0"
 [package.extras]
 numpy = ["NumPy"]
 
+[[package]]
+name = "speechrecognition"
+version = "3.10.0"
+description = "Library for performing speech recognition, with support for several engines and APIs, online and offline."
+category = "main"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "SpeechRecognition-3.10.0-py2.py3-none-any.whl", hash = "sha256:7ae9966887d9909ce3e5a0c27ecc3eacfca16fd0c0829f77f552919418e86306"},
+    {file = "SpeechRecognition-3.10.0.tar.gz", hash = "sha256:14131155e8a8ba00ead1b7b9b1a2fa71c845e4db5f9a5f66a33a1bd6c55c6c35"},
+]
+
+[package.dependencies]
+requests = ">=2.26.0"
+
+[package.extras]
+whisper-api = ["openai"]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.15"
@@ -3197,4 +3215,4 @@ transcribers = ["google-cloud-speech"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<3.12"
-content-hash = "a0ca33d55bc2f9f3f4f628c1e5ee66d87dc8f532a67a39fa1e2319080128664c"
+content-hash = "ccd129e9b3a65b813feac944f0f1eaae86a63c99f556d52291367e90a57ff57c"
diff --git a/pyproject.toml b/pyproject.toml
index 309b68dd5..d862dfc58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,8 @@ google-cloud-speech = {version = "^2.19.0", optional = true}
 redis = {version = "^4.5.4", optional = true}
 twilio = {version = "^8.1.0", optional = true}
 nylas = {version = "^5.14.0", optional = true}
+speechrecognition = "^3.10.0"
+aiohttp = "^3.8.4"
 
 
 [tool.poetry.group.lint.dependencies]
diff --git a/vocode/turn_based/agent/chat_gpt_agent.py b/vocode/turn_based/agent/chat_gpt_agent.py
index ae677c7c7..e81d9e168 100644
--- a/vocode/turn_based/agent/chat_gpt_agent.py
+++ b/vocode/turn_based/agent/chat_gpt_agent.py
@@ -23,6 +23,7 @@ def __init__(
         model_name: str = "gpt-3.5-turbo",
         temperature: float = 0.7,
         max_tokens: int = 100,
+        memory: Optional[ConversationBufferMemory] = None,
     ):
         super().__init__(initial_message=initial_message)
         openai.api_key = getenv("OPENAI_API_KEY", api_key)
@@ -35,7 +36,7 @@ def __init__(
                 HumanMessagePromptTemplate.from_template("{input}"),
             ]
         )
-        self.memory = ConversationBufferMemory(return_messages=True)
+        self.memory = memory if memory else ConversationBufferMemory(return_messages=True)
         if initial_message:
             self.memory.chat_memory.add_ai_message(initial_message)
         self.llm = ChatOpenAI(  # type: ignore
@@ -48,4 +49,4 @@ def __init__(
         )
 
     def respond(self, human_input: str):
-        return self.conversation.predict(input=human_input)
+        return self.conversation.predict(input=human_input)
\ No newline at end of file
diff --git a/vocode/turn_based/synthesizer/base_synthesizer.py b/vocode/turn_based/synthesizer/base_synthesizer.py
index 41f290d3c..1646ce255 100644
--- a/vocode/turn_based/synthesizer/base_synthesizer.py
+++ b/vocode/turn_based/synthesizer/base_synthesizer.py
@@ -4,3 +4,5 @@ class BaseSynthesizer:
     def synthesize(self, text) -> AudioSegment:
         raise NotImplementedError
 
+    async def async_synthesize(self, text) -> AudioSegment:
+        raise NotImplementedError
diff --git a/vocode/turn_based/synthesizer/coqui_synthesizer.py b/vocode/turn_based/synthesizer/coqui_synthesizer.py
index e0f1f4e28..2eea0a09b 100644
--- a/vocode/turn_based/synthesizer/coqui_synthesizer.py
+++ b/vocode/turn_based/synthesizer/coqui_synthesizer.py
@@ -1,29 +1,163 @@
 import io
-from typing import Optional
+import re
+import typing
+from typing import Optional, List
 from pydub import AudioSegment
 import requests
 from vocode import getenv
 from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
+import aiohttp
+import asyncio
 
-COQUI_BASE_URL = "https://app.coqui.ai/api/v2/"
+COQUI_BASE_URL = "https://app.coqui.ai/api/v2/samples"
 DEFAULT_SPEAKER_ID = "d2bd7ccb-1b65-4005-9578-32c4e02d8ddf"
+MAX_TEXT_LENGTH = 250  # The maximum length of text that can be synthesized at once
 
 
 class CoquiSynthesizer(BaseSynthesizer):
-    def __init__(self, voice_id: Optional[str] = None, api_key: Optional[str] = None):
+    def __init__(
+        self,
+        voice_id: Optional[str] = None,
+        voice_prompt: Optional[str] = None,
+        use_xtts: bool = False,
+        api_key: Optional[str] = None,
+    ):
         self.voice_id = voice_id or DEFAULT_SPEAKER_ID
+        self.voice_prompt = voice_prompt
+        self.use_xtts = use_xtts
         self.api_key = getenv("COQUI_API_KEY", api_key)
 
     def synthesize(self, text: str) -> AudioSegment:
-        url = COQUI_BASE_URL + "samples"
-        headers = {"Authorization": f"Bearer {self.api_key}"}
-        body = {
-            "text": text,
-            "speaker_id": self.voice_id,
-            "name": "unnamed",
-        }
+        text_chunks = self.split_text(text)
+        audio_chunks = [self.synthesize_chunk(chunk) for chunk in text_chunks]
+        return sum(audio_chunks)  # type: ignore
+
+    def synthesize_chunk(self, text: str) -> AudioSegment:
+        url, headers, body = self.get_request(text)
+
+        # Get the sample
         response = requests.post(url, headers=headers, json=body)
         assert response.ok, response.text
         sample = response.json()
         response = requests.get(sample["audio_url"])
         return AudioSegment.from_wav(io.BytesIO(response.content))  # type: ignore
+
+    def split_text(self, string):
+        # Base case: if the string is less than or equal to MAX_TEXT_LENGTH characters, return it as a single element array
+        if len(string) <= MAX_TEXT_LENGTH:
+            return [string.strip()]
+
+        # Recursive case: find the index of the last sentence ender in the first MAX_TEXT_LENGTH characters of the string
+        sentence_enders = [".", "!", "?"]
+        index = -1
+        for ender in sentence_enders:
+            i = string[:MAX_TEXT_LENGTH].rfind(ender)
+            if i > index:
+                index = i
+
+        # If there is a sentence ender, split the string at that index plus one and strip any spaces from both parts
+        if index != -1:
+            first_part = string[:index + 1].strip()
+            second_part = string[index + 1:].strip()
+
+        # If there is no sentence ender, find the index of the last comma in the first MAX_TEXT_LENGTH characters of the string
+        else:
+            index = string[:MAX_TEXT_LENGTH].rfind(",")
+            # If there is a comma, split the string at that index plus one and strip any spaces from both parts
+            if index != -1:
+                first_part = string[:index + 1].strip()
+                second_part = string[index + 1:].strip()
+
+            # If there is no comma, find the index of the last space in the first MAX_TEXT_LENGTH characters of the string
+            else:
+                index = string[:MAX_TEXT_LENGTH].rfind(" ")
+                # If there is a space, split the string at that index and strip any spaces from both parts
+                if index != -1:
+                    first_part = string[:index].strip()
+                    second_part = string[index:].strip()
+
+                # If there is no space, split the string at MAX_TEXT_LENGTH characters and strip any spaces from both parts
+                else:
+                    first_part = string[:MAX_TEXT_LENGTH].strip()
+                    second_part = string[MAX_TEXT_LENGTH:].strip()
+
+        # Append the first part to the result array
+        result = [first_part]
+
+        # Call the function recursively on the remaining part of the string and extend the result array with it, unless it is empty
+        if second_part != "":
+            result.extend(self.split_text(second_part))
+
+        # Return the result array
+        return result
+
+    async def async_synthesize(self, text: str) -> AudioSegment:
+        # This method is similar to the synthesize method, but it uses async IO to synthesize each chunk in parallel
+
+        # Split the text into chunks of less than MAX_TEXT_LENGTH characters
+        text_chunks = self.split_text(text)
+
+        # Create a list of tasks for each chunk using asyncio.create_task()
+        tasks = [
+            asyncio.create_task(self.async_synthesize_chunk(chunk))
+            for chunk in text_chunks
+        ]
+
+        # Wait for all tasks to complete using asyncio.gather()
+        audio_chunks = await asyncio.gather(*tasks)
+
+        # Concatenate and return the results
+        return sum(audio_chunks)  # type: ignore
+
+    async def async_synthesize_chunk(self, text: str) -> AudioSegment:
+        url, headers, body = self.get_request(text)
+
+        # Create an aiohttp session and post the request asynchronously using await
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, headers=headers, json=body) as response:
+                assert response.status == 201, (
+                    await response.text() + url + str(headers) + str(body)
+                )
+                sample = await response.json()
+                audio_url = sample["audio_url"]
+
+            # Get the audio data asynchronously using await
+            async with session.get(audio_url) as response:
+                assert response.status == 200, "Coqui audio download failed"
+                audio_data = await response.read()
+
+        # Return an AudioSegment object from the audio data
+        return AudioSegment.from_wav(io.BytesIO(audio_data))  # type: ignore
+
+    def get_request(
+        self, text: str
+    ) -> typing.Tuple[str, typing.Dict[str, str], typing.Dict[str, object]]:
+        url = COQUI_BASE_URL
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+        body = {
+            "text": text,
+            "speed": 1,
+        }
+
+        if self.use_xtts:
+            url += "/xtts/"
+            # If we have a voice prompt, use that instead of the voice ID
+            if self.voice_prompt is not None:
+                url += "render-from-prompt/"
+                body["prompt"] = self.voice_prompt
+            else:
+                url += "render/"
+                body["voice_id"] = self.voice_id
+        else:
+            if self.voice_prompt is not None:
+                url += "/from-prompt/"
+                body["prompt"] = self.voice_prompt
+            else:
+                body["voice_id"] = self.voice_id
+        return url, headers, body
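
Usage sketch: a minimal example of driving the new async path this patch adds.
It assumes COQUI_API_KEY is set in the environment (the synthesizer reads it
via getenv, per the diff above); the voice prompt string, the output filename,
and the asyncio.run() entry point are illustrative values, not taken from this
commit.

    import asyncio

    from vocode.turn_based.synthesizer.coqui_synthesizer import CoquiSynthesizer

    async def main() -> None:
        # use_xtts and voice_prompt are the options introduced by this patch;
        # with use_xtts=True and a prompt set, get_request targets the
        # /xtts/render-from-prompt/ endpoint instead of using a voice_id.
        synthesizer = CoquiSynthesizer(
            use_xtts=True,
            voice_prompt="A calm, friendly narrator",  # hypothetical prompt
        )
        # Input longer than MAX_TEXT_LENGTH (250 chars) is split on sentence
        # enders, commas, or spaces, each chunk is synthesized concurrently
        # via asyncio.gather, and the AudioSegments are summed back together.
        audio = await synthesizer.async_synthesize(
            "Hello! This is a quick test of the asynchronous Coqui synthesizer."
        )
        audio.export("output.wav", format="wav")  # hypothetical output path

    if __name__ == "__main__":
        asyncio.run(main())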