hello-robot · hello-amal · Aug 7, 2024 · Jul 29, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,2 @@
 *.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -36,6 +36,7 @@ jobs:
       run: git config --global --add safe.directory `pwd`
     - name: Install dependencies and code
       run: |
+        # espeak is the system speech backend needed by the offline TTS engine.
+        # -y is required: without it apt-get asks for confirmation and aborts
+        # because a CI job has no TTY to answer the prompt.
+        apt-get install -y espeak
         # ./install.sh -y --no-remove --no-submodules
         python -m pip install -e ./src[dev]
         python -m pip install mypy flake8 pytest

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,4 +1,9 @@
 repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-added-large-files
+
   - repo: https://github.com/psf/black
     rev: 22.3.0
     hooks:
@@ -57,4 +62,5 @@ repos:
           (?x)^( 
             | .*\.svg 
             | .*\.tsv
+            | ^src/test/audio/test_text_to_speech.py
           )$ 
diff --git a/src/setup.py b/src/setup.py
@@ -59,9 +59,16 @@
         # UI tools
         "termcolor",
         # Audio
-        "pyaudio",
-        "wave",
+        "gtts",  # online TTS engine
+        "librosa",  # audio analysis (e.g., spectral similarity)
+        "PyAudio==0.2.14",  # the version specification is necessary because apt has 0.2.12 which is incompatible with recent numpy
         "openai-whisper",
+        "overrides",  # better inheritance of docstrings
+        "pydub",  # playback audio
+        "pyttsx3",  # offline TTS engine. TODO: There are better options, such as "tts_models/en/ljspeech/fast_pitch" from https://github.com/coqui-ai/TTS
+        "simpleaudio",  # playback audio
+        "sounddevice",  # Suppresses ALSA warnings when launching PyAudio
+        "wave",
         # These are not supported in python 3.12
         "scikit-fmm",
         "open3d",

diff --git a/src/stretch/app/text_to_speech.py b/src/stretch/app/text_to_speech.py
@@ -0,0 +1,136 @@
+"""
+This script adds a command-line interface (CLI) for text-to-speech. The CLI supports:
+- Typing text to convert to speech (added to queue).
+- Stopping ongoing speech.
+- Using the up arrow key to access the history.
+- Using tab auto-complete to search through the history.
+- Passing in a custom file to load the history (including pre-seeded text) from
+  and save it to.
+"""
+# Standard imports
+import argparse
+import os
+import readline  # Improve interactive input, e.g., up to access history, tab auto-completion.
+
+from stretch.audio.text_to_speech.executor import TextToSpeechExecutor
+from stretch.audio.text_to_speech.gtts_engine import GTTSTextToSpeech
+
+# Local imports
+from stretch.audio.utils.cli import HistoryCompleter
+
+
+class TextToSpeechComandLineInterface:
+    """
+    A command-line interface to use text-to-speech.
+
+    Typed messages are handed to a `TextToSpeechExecutor` that wraps a
+    `GTTSTextToSpeech` engine. The single-character commands ``S`` and ``Q``
+    (case-insensitive) stop the current utterance; ``Q`` additionally exits
+    the prompt loop.
+    """
+
+    def __init__(self):
+        """
+        Initialize the TextToSpeechComandLineInterface.
+        """
+        self._executor = TextToSpeechExecutor(
+            engine=GTTSTextToSpeech(),
+        )
+
+    def start(self) -> None:
+        """
+        Start the text-to-speech command line interface.
+        """
+        self._executor.start()
+
+    def stop(self) -> None:
+        """
+        Stop the text-to-speech command line interface.
+        """
+        self._executor.stop()
+
+    @staticmethod
+    def _forget_command(command: str) -> None:
+        """
+        Remove a just-typed control command from the readline history.
+
+        The entry is only removed when the newest history item really is the
+        command. When stdin does not go through readline (e.g., piped input)
+        the command was never recorded, so removing "the last item" would
+        either fail on an empty history or delete an unrelated entry.
+
+        Parameters
+        ----------
+        command : str
+            The stripped control command the user entered (e.g., "S" or "q").
+        """
+        newest = readline.get_current_history_length()
+        if newest == 0:
+            return
+        # get_history_item() is one-based, remove_history_item() is zero-based.
+        newest_entry = readline.get_history_item(newest) or ""
+        if newest_entry.strip() == command:
+            readline.remove_history_item(newest - 1)
+
+    def run(self) -> None:
+        """
+        Run the text-to-speech command line interface.
+
+        Loops forever reading messages from the user. Empty input is ignored,
+        "S" stops the current utterance, and any other text is queued to be
+        spoken.
+
+        Raises
+        ------
+        KeyboardInterrupt
+            Raised deliberately when the user enters "Q", so that quitting
+            takes the same exit path as pressing Ctrl-C.
+        """
+        # Create the input prompt
+        print("****************************************************************")
+        print("Instructions:")
+        print("    Type a message to convert to speech.")
+        print("    Press S to stop the current message.")
+        print("    Press Q to exit and stop the current message.")
+        print("    Press Ctrl-C to exit without stopping the current message")
+        print("****************************************************************")
+
+        while True:
+            # Get the user input
+            message = input("\nMessage (S to stop, Q to exit): ").strip()
+            if len(message) == 0:
+                continue
+
+            # Process the special 1-character commands
+            if len(message) == 1 and message.upper() in ("Q", "S"):
+                # Both commands stop the current message and must not
+                # pollute the history that gets saved on exit.
+                self._executor.stop_utterance()
+                self._forget_command(message)
+                if message.upper() == "Q":
+                    raise KeyboardInterrupt
+                continue
+
+            # Publish the message
+            self._executor.say_utterance(message)
+
+
+def get_args() -> argparse.Namespace:
+    """
+    Parse the command-line arguments of the text-to-speech CLI.
+
+    Returns
+    -------
+    argparse.Namespace
+        The parsed arguments. ``history_file`` holds the path of the history
+        file to load and save, or an empty string when none was passed.
+    """
+    cli_parser = argparse.ArgumentParser(description="Text-to-speech command line interface.")
+    history_option = {
+        "type": str,
+        "default": "",
+        "help": "The history file to load and save.",
+    }
+    cli_parser.add_argument("--history_file", **history_option)
+    parsed_args = cli_parser.parse_args()
+    return parsed_args
+
+
+def main():
+    """
+    Run the text-to-speech command line interface.
+
+    Loads the readline history (if ``--history_file`` points to an existing
+    file), enables tab completion over that history, runs the interactive
+    prompt until the user exits, and finally saves the history and stops the
+    text-to-speech interface.
+    """
+    # Get the arguments
+    args = get_args()
+    history_file = args.history_file
+
+    # Load the history
+    if history_file and os.path.exists(history_file):
+        readline.read_history_file(history_file)
+        print(f"Loaded the history from {history_file}")
+    readline.set_completer(HistoryCompleter().complete)
+    readline.parse_and_bind("tab: complete")
+    readline.set_completer_delims("")  # Match the entire string, not individual words
+
+    # Initialize the text-to-speech command line interface
+    cli = TextToSpeechComandLineInterface()
+    cli.start()
+
+    # Run the text-to-speech command line interface
+    try:
+        cli.run()
+    except (KeyboardInterrupt, EOFError):
+        # Ctrl-C, the "Q" command, and end-of-input (Ctrl-D) are all normal
+        # ways to leave the prompt, so none of them should print a traceback.
+        pass
+    finally:
+        try:
+            # Save the history
+            if history_file:
+                readline.write_history_file(history_file)
+                print(f"Saved the history to {history_file}")
+            else:
+                print("Did not save the history. To do so, pass in --history_file")
+        except OSError as err:
+            # An unwritable history path must not prevent the shutdown below.
+            print(f"Failed to save the history to {history_file}: {err}")
+        finally:
+            # Stop the text-to-speech command line interface, even if saving
+            # the history failed.
+            cli.stop()
+
+        print("Cleanly terminated.")
+
+
+# Launch the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/src/stretch/audio/base.py b/src/stretch/audio/base.py
@@ -1,7 +1,15 @@
+# Standard imports
 import abc
+import logging
+from typing import Any
 
+# Third-party imports
 import numpy as np
 
+# Create the default logger. NOTE: `basicConfig` runs as an import-time side
+# effect and requests INFO-level root logging. `DEFAULT_LOGGER` is the
+# fallback used when `AbstractTextToSpeech` is constructed without a logger.
+logging.basicConfig(level=logging.INFO)
+DEFAULT_LOGGER = logging.getLogger(__name__)
+
 
 class AbstractSpeechToText(abc.ABC):
     """Basic speech to text module"""
@@ -27,3 +35,169 @@ def transcribe_file(self, audio_file: str) -> str:
             str: Transcribed text.
         """
         pass
+
+
+class AbstractTextToSpeech(abc.ABC):
+    """
+    Common interface for text-to-speech engines.
+
+    The base class stores the shared state (logger, known voice IDs, selected
+    voice ID, slow-speech flag) and declares the operations every concrete
+    engine must implement:
+      - Speaking text synchronously (`say`) and asynchronously (`say_async`).
+      - Reporting whether speech is in progress (`is_speaking`).
+      - Interrupting speech (`stop`).
+      - Rendering text to an audio file (`save_to_file`).
+    """
+
+    def __init__(self, logger: logging.Logger = DEFAULT_LOGGER):
+        """
+        Set up the state shared by all text-to-speech engines.
+
+        Parameters
+        ----------
+        logger : logging.Logger
+            Where the engine reports messages. Defaults to this module's
+            logger.
+        """
+        self._logger = logger
+
+        # Voice selection starts out empty: no known voices, none selected.
+        self._voice_ids: list[str] = []
+        self._voice_id = ""
+
+        # Normal (not slow) speech is the default.
+        self._is_slow = False
+
+        # Flag for asynchronous-speech capability; off in the base class.
+        self._can_say_async = False
+
+    @property
+    def voice_ids(self) -> list[str]:
+        """
+        The voice IDs this engine knows about.
+
+        Returns
+        -------
+        list[str]
+            The engine's internal list of voice IDs (not a copy).
+        """
+        return self._voice_ids
+
+    @property
+    def voice_id(self) -> str:
+        """
+        The voice ID that is currently selected.
+
+        Returns
+        -------
+        str
+            The selected voice ID; an empty string until one has been set.
+        """
+        return self._voice_id
+
+    @voice_id.setter
+    def voice_id(self, voice_id: str) -> None:
+        """
+        Select a voice ID, provided it is one of `voice_ids`.
+
+        An unknown ID is logged as an error and the current selection is
+        left untouched; no exception is raised.
+
+        Parameters
+        ----------
+        voice_id : str
+            The voice ID to select.
+        """
+        if voice_id not in self._voice_ids:
+            self._logger.error(f"Invalid voice ID: {voice_id}")
+            return
+        self._voice_id = voice_id
+
+    @property
+    def is_slow(self) -> bool:
+        """
+        Whether slow speech is enabled.
+
+        Returns
+        -------
+        bool
+            True if the engine is set to speak slowly, False otherwise.
+        """
+        return self._is_slow
+
+    @is_slow.setter
+    def is_slow(self, is_slow: bool) -> None:
+        """
+        Enable or disable slow speech.
+
+        Parameters
+        ----------
+        is_slow : bool
+            True to speak slowly, False for the default speed.
+        """
+        self._is_slow = is_slow
+
+    @abc.abstractmethod
+    def say_async(self, text: str) -> None:
+        """
+        Speak the given text asynchronously. Must be implemented by subclasses.
+
+        Parameters
+        ----------
+        text : str
+            The text to speak.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_speaking(self) -> bool:
+        """
+        Report whether the engine is currently speaking. Must be implemented
+        by subclasses.
+
+        Returns
+        -------
+        bool
+            True while speech is in progress, False otherwise.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def say(self, text: str) -> None:
+        """
+        Speak the given text synchronously. Must be implemented by subclasses.
+
+        Parameters
+        ----------
+        text : str
+            The text to speak.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def stop(self) -> None:
+        """
+        Stop speaking the current text. Must be implemented by subclasses.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def is_file_type_supported(filepath: str) -> bool:
+        """
+        Check whether a file path has a supported audio file type.
+
+        This lives on the base class as a static method so that every engine
+        agrees on the supported file type(s). Currently only MP3 is accepted;
+        the check ignores case and surrounding whitespace.
+
+        Parameters
+        ----------
+        filepath : str
+            The path of the file to check.
+
+        Returns
+        -------
+        bool
+            True if the path ends in ".mp3", False otherwise.
+        """
+        normalized_path = filepath.lower().strip()
+        return normalized_path.endswith(".mp3")
+
+    @abc.abstractmethod
+    def save_to_file(self, text: str, filepath: str, **kwargs: Any):
+        """
+        Render the given text to an audio file. Must be implemented by
+        subclasses.
+
+        Parameters
+        ----------
+        text : str
+            The text to convert to speech.
+        filepath : str
+            Where to write the audio file.
+        **kwargs : Any
+            Additional engine-specific options.
+        """
+        raise NotImplementedError
diff --git a/src/stretch/audio/text_to_speech/__init__.py b/src/stretch/audio/text_to_speech/__init__.py