Merge pull request #456 from gilcu3/tts-support

tts support (WIP)
n3d1117 · Nov 18, 2023 · 8fd24a9 · 8fd24a9
2 parents 2a0d1aa + c3ec1af
commit 8fd24a9
Show file tree

Hide file tree

Showing 7 changed files with 224 additions and 3 deletions.
diff --git a/.env.example b/.env.example
@@ -19,6 +19,7 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # TRANSCRIPTION_PRICE=0.006
 # ENABLE_QUOTING=true
 # ENABLE_IMAGE_GENERATION=true
+# ENABLE_TTS_GENERATION=true
 # ENABLE_TRANSCRIPTION=true
 # PROXY=http://localhost:8080
 # OPENAI_MODEL=gpt-3.5-turbo
@@ -42,4 +43,6 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # IMAGE_FORMAT=document
 # GROUP_TRIGGER_KEYWORD=""
 # IGNORE_GROUP_TRANSCRIPTIONS=true
+# TTS_MODEL="tts-1"
+# TTS_VOICE="alloy"
 # BOT_LANGUAGE=en
diff --git a/README.md b/README.md
@@ -83,6 +83,7 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `ENABLE_QUOTING`                   | Whether to enable message quoting in private chats                                                                                                                                                                                                                    | `true`                              |
 | `ENABLE_IMAGE_GENERATION`          | Whether to enable image generation via the `/image` command                                                                                                                                                                                                           | `true`                              |
 | `ENABLE_TRANSCRIPTION`             | Whether to enable transcriptions of audio and video messages                                                                                                                                                                                                          | `true`                              |
+| `ENABLE_TTS_GENERATION`             | Whether to enable text to speech generation via the `/tts`                                                                                                                                                                                                              | `true`                              |
 | `PROXY`                            | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`)                                                                                                                                                                                           | -                                   |
 | `OPENAI_MODEL`                     | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/)                                                                                                                                  | `gpt-3.5-turbo`                     |
 | `OPENAI_API_BASE`                  | Endpoint URL for unofficial OpenAI-compatible APIs (e.g., LocalAI or text-generation-webui)                                                                                                                                                                           | Default OpenAI API URL              |
@@ -107,6 +108,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `IGNORE_GROUP_TRANSCRIPTIONS`      | If set to true, the bot will not process transcriptions in group chats                                                                                                                                                                                                | `true`                              |
 | `BOT_LANGUAGE`                     | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`, `uz`.  [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en`                                |
 | `WHISPER_PROMPT`                     | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message.  [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting) | `-`                                |
+| `TTS_VOICE`                       | The Text to Speech voice to use. Allowed values: `alloy`, `echo`, `fable`, `onyx`, `nova`, or `shimmer`                                                                                                                                                                                  | `alloy`                           |
+| `TTS_MODEL`                       | The Text to Speech model to use. Allowed values: `tts-1` or `tts-1-hd`                                                                                                                                                                                  | `tts-1`                           |
 
 Check out the [official API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
 

diff --git a/bot/main.py b/bot/main.py
@@ -53,6 +53,8 @@ def main():
         'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
         'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
         'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
     }
 
     if openai_config['enable_functions'] and not functions_available:
@@ -73,6 +75,7 @@ def main():
         'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
         'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
         'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
+        'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
         'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
         'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
         'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
@@ -85,6 +88,8 @@ def main():
         'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
         'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
         'image_receive_mode': os.environ.get('IMAGE_FORMAT', "photo"),
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],
         'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
         'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
     }

diff --git a/bot/openai_helper.py b/bot/openai_helper.py
@@ -10,6 +10,7 @@
 import requests
 import json
 import httpx
+import io
 from datetime import date
 from calendar import monthrange
 
@@ -342,6 +343,28 @@ async def generate_image(self, prompt: str) -> tuple[str, str]:
         except Exception as e:
             raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
 
+    async def generate_speech(self, text: str) -> tuple[any, int]:
+        """
+        Generates speech audio from the given text using the configured TTS model and voice.
+        :param text: The text to convert to speech
+        :return: A tuple of an in-memory `io.BytesIO` buffer holding the audio (requested in
+            `opus` format and rewound to position 0) and the number of characters in `text`
+        :raises Exception: any error raised while generating the audio is re-raised as a
+            generic Exception whose message starts with the localized 'error' text
+        NOTE(review): the return annotation uses the builtin `any`; the first element that is
+        actually returned is the `io.BytesIO` buffer created below.
+        """
+        bot_language = self.config['bot_language']
+        try:
+            response = await self.client.audio.speech.create(
+                model=self.config['tts_model'],
+                voice=self.config['tts_voice'],
+                input=text,
+                response_format='opus'
+            )
+
+            # copy the response body into a seekable in-memory buffer
+            temp_file = io.BytesIO()
+            temp_file.write(response.read())
+            # rewind so that the caller reads the audio from the beginning
+            temp_file.seek(0)
+            # the text length is returned so that the caller can account for the usage
+            return temp_file, len(text)
+        except Exception as e:
+            raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
+
     async def transcribe(self, filename):
         """
         Transcribes the audio file using the Whisper model.

diff --git a/bot/telegram_bot.py b/bot/telegram_bot.py
@@ -46,6 +46,9 @@ def __init__(self, config: dict, openai: OpenAIHelper):
         if self.config.get('enable_image_generation', False):
             self.commands.append(BotCommand(command='image', description=localized_text('image_description', bot_language)))
 
+        if self.config.get('enable_tts_generation', False):
+            self.commands.append(BotCommand(command='tts', description=localized_text('tts_description', bot_language)))
+
         self.group_commands = [BotCommand(
             command='chat', description=localized_text('chat_description', bot_language)
         )] + self.commands
@@ -94,6 +97,7 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         images_today, images_month = self.usage[user_id].get_current_image_count()
         (transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
          transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
+        characters_today, characters_month = self.usage[user_id].get_current_tts_usage()
         current_cost = self.usage[user_id].get_current_cost()
 
         chat_id = update.effective_chat.id
@@ -112,11 +116,16 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         text_today_images = ""
         if self.config.get('enable_image_generation', False):
             text_today_images = f"{images_today} {localized_text('stats_images', bot_language)}\n"
+
+        text_today_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n"
 
         text_today = (
             f"*{localized_text('usage_today', bot_language)}:*\n"
             f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
             f"{text_today_images}"  # Include the image statistics for today if applicable
+            f"{text_today_tts}"
             f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
             f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
             f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
@@ -126,12 +135,17 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         text_month_images = ""
         if self.config.get('enable_image_generation', False):
             text_month_images = f"{images_month} {localized_text('stats_images', bot_language)}\n"
+
+        text_month_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_month_tts = f"{characters_month} {localized_text('stats_tts', bot_language)}\n"
 
         # Check if image generation is enabled and, if so, generate the image statistics for the month
         text_month = (
             f"*{localized_text('usage_month', bot_language)}:*\n"
             f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
             f"{text_month_images}"  # Include the image statistics for the month if applicable
+            f"{text_month_tts}"
             f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
             f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
             f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
@@ -258,6 +272,52 @@ async def _generate():
 
         await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_PHOTO)
 
+    async def tts(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
+        """
+        Generates speech for the given input text using the TTS API and replies with it
+        as a voice message. Does nothing when TTS generation is disabled or when
+        `check_allowed_and_within_budget` rejects the request.
+        """
+        if not self.config['enable_tts_generation'] \
+                or not await self.check_allowed_and_within_budget(update, context):
+            return
+
+        tts_query = message_text(update.message)
+        # nothing to convert: tell the user that a text is required
+        if tts_query == '':
+            await update.effective_message.reply_text(
+                message_thread_id=get_thread_id(update),
+                text=localized_text('tts_no_prompt', self.config['bot_language'])
+            )
+            return
+
+        logging.info(f'New speech generation request received from user {update.message.from_user.name} '
+                     f'(id: {update.message.from_user.id})')
+
+        async def _generate():
+            try:
+                speech_file, text_length = await self.openai.generate_speech(text=tts_query)
+
+                await update.effective_message.reply_voice(
+                    reply_to_message_id=get_reply_to_message_id(self.config, update),
+                    voice=speech_file
+                )
+                # NOTE(review): the buffer is only closed when reply_voice succeeds
+                speech_file.close()
+                # add tts request to the user's usage tracker
+                # NOTE(review): usage is recorded after the voice message was sent, so an error
+                # raised by the tracker is reported as 'tts_fail' although the audio was delivered
+                user_id = update.message.from_user.id
+                self.usage[user_id].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
+                # add guest tts request to guest usage tracker
+                if str(user_id) not in self.config['allowed_user_ids'].split(',') and 'guests' in self.usage:
+                    self.usage["guests"].add_tts_request(text_length, self.config['tts_model'], self.config['tts_prices'])
+
+            except Exception as e:
+                # log the full traceback and report the failure to the user
+                logging.exception(e)
+                await update.effective_message.reply_text(
+                    message_thread_id=get_thread_id(update),
+                    reply_to_message_id=get_reply_to_message_id(self.config, update),
+                    text=f"{localized_text('tts_fail', self.config['bot_language'])}: {str(e)}",
+                    parse_mode=constants.ParseMode.MARKDOWN
+                )
+
+        # run the generation while the UPLOAD_VOICE chat action is passed to wrap_with_indicator
+        await wrap_with_indicator(update, context, _generate, constants.ChatAction.UPLOAD_VOICE)
+
     async def transcribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         """
         Transcribe audio messages.
@@ -794,6 +854,7 @@ def run(self):
         application.add_handler(CommandHandler('reset', self.reset))
         application.add_handler(CommandHandler('help', self.help))
         application.add_handler(CommandHandler('image', self.image))
+        application.add_handler(CommandHandler('tts', self.tts))
         application.add_handler(CommandHandler('start', self.help))
         application.add_handler(CommandHandler('stats', self.stats))
         application.add_handler(CommandHandler('resend', self.resend))

diff --git a/bot/usage_tracker.py b/bot/usage_tracker.py
@@ -56,14 +56,16 @@ def __init__(self, user_id, user_name, logs_dir="usage_logs"):
         if os.path.isfile(self.user_file):
             with open(self.user_file, "r") as file:
                 self.usage = json.load(file)
+            if 'tts_characters' not in self.usage['usage_history']:
+                self.usage['usage_history']['tts_characters'] = {}
         else:
             # ensure directory exists
             pathlib.Path(logs_dir).mkdir(exist_ok=True)
             # create new dictionary for this user
             self.usage = {
                 "user_name": user_name,
                 "current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
-                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}}
+                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "tts_characters": {}}
             }
 
     # token usage functions:
@@ -151,6 +153,57 @@ def get_current_image_count(self):
                 usage_month += sum(images)
         return usage_day, usage_month
 
+    # tts usage functions:
+
+    def add_tts_request(self, text_length, tts_model, tts_prices):
+        """Add a text to speech request to the usage history and update the current costs.
+
+        :param text_length: number of characters that were converted to speech
+        :param tts_model: name of the model used; must be 'tts-1' or 'tts-1-hd',
+            any other value makes the price lookup raise a ValueError
+        :param tts_prices: prices per 1000 characters, in the same order as
+            ['tts-1', 'tts-1-hd']
+        """
+        tts_models = ['tts-1', 'tts-1-hd']
+        # the position of the model in tts_models selects its price
+        price = tts_prices[tts_models.index(tts_model)]
+        today = date.today()
+        # cost is rounded to two decimals, so a request below half a cent adds 0.0
+        tts_price = round(text_length * price / 1000, 2)
+        self.add_current_costs(tts_price)
+
+        # usage files that do not contain tts data yet get an empty history
+        if 'tts_characters' not in self.usage['usage_history']:
+            self.usage['usage_history']['tts_characters'] = {}
+
+        # the history is kept per model: {model: {date string: characters}}
+        if tts_model not in self.usage['usage_history']['tts_characters']:
+            self.usage['usage_history']['tts_characters'][tts_model] = {}
+
+        # update usage_history
+        if str(today) in self.usage["usage_history"]["tts_characters"][tts_model]:
+            # add requested text length to existing date
+            self.usage["usage_history"]["tts_characters"][tts_model][str(today)] += text_length
+        else:
+            # create new entry for current date
+            self.usage["usage_history"]["tts_characters"][tts_model][str(today)] = text_length
+
+        # write updated tts usage to user file
+        with open(self.user_file, "w") as outfile:
+            json.dump(self.usage, outfile)
+
+    def get_current_tts_usage(self):
+        """Get the number of characters converted to speech today and this month.
+
+        Only the models 'tts-1' and 'tts-1-hd' are counted; their characters are summed.
+
+        :return: tuple (characters today, characters this month) as integers
+        """
+
+        tts_models = ['tts-1', 'tts-1-hd']
+        today = date.today()
+        characters_day = 0
+        # sum up today's entry of every known model, if there is one
+        for tts_model in tts_models:
+            if tts_model in self.usage["usage_history"]["tts_characters"] and \
+                str(today) in self.usage["usage_history"]["tts_characters"][tts_model]:
+                characters_day += self.usage["usage_history"]["tts_characters"][tts_model][str(today)]
+
+        month = str(today)[:7]  # year-month as string
+        characters_month = 0
+        for tts_model in tts_models:
+            if tts_model in self.usage["usage_history"]["tts_characters"]: 
+                # NOTE(review): the loop variable `today` rebinds the date object from above
+                # to the date-string keys of the history; it is not used as a date afterwards
+                for today, characters in self.usage["usage_history"]["tts_characters"][tts_model].items():
+                    # keys starting with the year-month prefix belong to the current month
+                    if today.startswith(month):
+                        characters_month += characters
+        return int(characters_day), int(characters_month)
+
+
     # transcription usage functions:
 
     def add_transcription_seconds(self, seconds, minute_price=0.006):
@@ -236,13 +289,14 @@ def get_current_cost(self):
         cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
         return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
 
-    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006):
+    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, tts_prices='0.015,0.030'):
         """Get total USD amount of all requests in history
         
         :param tokens_price: price per 1000 tokens, defaults to 0.002
         :param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
             defaults to [0.016, 0.018, 0.02]
         :param minute_price: price per minute transcription, defaults to 0.006
+        :param tts_prices: comma-separated prices per 1000 characters of tts for the models ['tts-1', 'tts-1-hd'], defaults to "0.015,0.030"
         :return: total cost of all requests
         """
         total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
@@ -255,5 +309,9 @@ def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018
         total_transcription_seconds = sum(self.usage['usage_history']['transcription_seconds'].values())
         transcription_cost = round(total_transcription_seconds * minute_price / 60, 2)
 
-        all_time_cost = token_cost + transcription_cost + image_cost
+        total_characters = [sum(tts_model.values()) for tts_model in self.usage['usage_history']['tts_characters'].values()]
+        tts_prices_list = [float(x) for x in tts_prices.split(',')]
+        tts_cost = round(sum([count * price / 1000 for count, price in zip(total_characters, tts_prices_list)]), 2)
+
+        all_time_cost = token_cost + transcription_cost + image_cost + tts_cost
         return all_time_cost