diff --git a/programs/speechSynthesis/README.md b/programs/speechSynthesis/README.md
index 177125d..7845108 100644
--- a/programs/speechSynthesis/README.md
+++ b/programs/speechSynthesis/README.md
@@ -2,32 +2,46 @@
 ## Installation
 
-Through pip:
+Note that **Python 3.9+ is required**. Through pip:
 
 ```bash
-pip3 install mycroft-mimic3-tts
+pip install piper-tts
 ```
 
-Alternatively, install from sources: https://github.com/MycroftAI/mimic3
+Alternatively, install from sources: https://github.com/rhasspy/piper
 
 ## Download voice models
 
-All voice data is located in a separate repository: https://github.com/MycroftAI/mimic3-voices
+All voice data is stored on Hugging Face: https://huggingface.co/rhasspy/piper-voices
 
-To manually issue the download of all Spanish voices, run:
+By default, `speechSynthesis` assumes `--context speechSynthesis --from speechSynthesis.ini`, i.e. it will spawn a `ResourceFinder` instance and look for a `speechSynthesis.ini` placed in a `speechSynthesis/` directory following the [YARP data directory specification](https://www.yarp.it/latest/yarp_data_dirs.html). The default context and configuration file can be overridden via command-line arguments, although this shouldn't be necessary. Voice models need to be downloaded, either manually or via `piper`, into the same directory as the .ini configuration file.
+
+It is advised to import the `speechSynthesis` context after installing the speech repository:
 
 ```bash
-mimic3-download 'es_ES/*'
+yarp-config context --import speechSynthesis
 ```
 
-In case the process gets stuck, download and unpack the files into `${HOME}/.local/share/mycroft/mimic3/voices`. However, you'll probably need to download the *generator.onnx* file separately (via GitHub) since it is handled by Git LFS.
+This command copies the installed context into a writable user-local path such as `$HOME/.local/share/yarp/contexts/speechSynthesis`. Change into this directory and run `piper` (see examples below) to automatically download the voice models, or download them manually from the Hugging Face repository and place them here.
+
+The following command produces no output: it simply downloads the model (if available on Hugging Face) and then blocks the terminal waiting for input on stdin (kill it with Ctrl+C once the download is complete):
+
+```bash
+piper --model es_ES-davefx-medium
+```
 
 ## Troubleshooting
 
-Try this:
+Try this (requires `aplay`, shipped with ALSA utils, e.g. the `alsa-utils` package on Debian/Ubuntu):
+
+```bash
+echo "hola, me llamo teo y tengo 10 años" | piper --model es_ES-davefx-medium --output-raw | aplay -r 22050 -f S16_LE -t raw -
+```
+
+Alternatively, keep the application open while reading from stdin:
 
 ```bash
-mimic3 --voice es_ES/m-ailabs#tux "hola, me llamo teo y tengo 10 años"
+piper --model es_ES-davefx-medium --output-raw | aplay -r 22050 -f S16_LE -t raw -
 ```
 
-To enable GPU acceleration, run `pip3 install onnxruntime-gpu` and issue the `mimic3` command with `--cuda`. The `speechSynthesis` app also accepts this parameter.
+To enable GPU acceleration, run `pip install onnxruntime-gpu` and issue the `piper` command with `--cuda`. The `speechSynthesis` app also accepts this parameter.
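+
+As a quick end-to-end check, the service can be driven over YARP RPC. This is only a sketch: the exact launch command depends on how the program is installed, and the RPC verbs are assumed to mirror the thrift method names used in `speechSynthesis.py` (`say`, `checkSayDone`, `stop`, ...):
+
+```bash
+# terminal 1: start the service
+python3 speechSynthesis.py --backend piper --model es_ES-davefx-medium
+# terminal 2: talk to it
+yarp rpc /speechSynthesis/rpc:s
+# then type, e.g.:  say "hola, me llamo teo y tengo 10 años"
+```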
diff --git a/programs/speechSynthesis/speechSynthesis.py b/programs/speechSynthesis/speechSynthesis.py
index 0b81a82..0f5ed44 100644
--- a/programs/speechSynthesis/speechSynthesis.py
+++ b/programs/speechSynthesis/speechSynthesis.py
@@ -1,181 +1,231 @@
 #!/usr/bin/env python3
 
-# adapted from https://github.com/MycroftAI/mimic3/blob/be72c18/mimic3_tts/__main__.py
-
 import argparse
-import queue
-import signal
-import shlex
-import shutil
-import subprocess
-import tempfile
-import threading
-import time
-
-import mimic3_tts
-import yarp
 import roboticslab_speech
+import sounddevice as sd
+import yarp
 
-PLAY_PROGRAMS = ['paplay', 'play -q', 'aplay -q']
-class TextToSpeechResponder(roboticslab_speech.SpeechSynthesis):
-    def __init__(self, engine):
-        super().__init__()
-        self.engine = engine
-        self.is_playing = False
-        self.p = None
-        self.result_queue = queue.Queue(maxsize=5)
-        self.result_thread = threading.Thread(target=self._process_result, daemon=True)
-        self.result_thread.start()
-    def setLanguage(self, language):
-        if language.startswith('#'):
-            # same voice, different speaker
-            self.engine.speaker = language[1:]
-        else:
-            # different voice
-            self.engine.voice = language
-        if self.engine.voice not in list(self.getSupportedLangs()):
-            print('Voice not available: %s' % self.engine.voice)
-            return False
-        else:
-            self.engine.preload_voice(self.engine.voice)
-            print('Loaded voice: %s (speaker: %s)' % (self.engine.voice, self.engine.speaker or 'default'))
-            return True
-    def setSpeed(self, speed):
-        self.engine.rate = float(speed) / 100
-        return True
+from abc import ABC, abstractmethod
-    def setPitch(self, pitch):
-        return super().setPitch(pitch)
+class SynthesizerFactory(ABC):
+    @abstractmethod
+    def create(self, player):
+        pass
-    def getSpeed(self):
-        return int(self.engine.rate * 100)
+class PiperSynthesizerFactory(SynthesizerFactory):
+    def __init__(self, model, use_cuda, rf):
+        self.model = model
+        self.use_cuda = use_cuda
+        self.rf = rf
-    def getPitch(self):
-        return super().getPitch()
+    def create(self, player):
+        return PiperSynthesizer(player, self.model, self.use_cuda, self.rf)
-    def getSupportedLangs(self):
-        all_voices = sorted(list(self.engine.get_voices()), key=lambda v: v.key)
-        local_voices = filter(lambda v: not v.location.startswith('http'), all_voices)
-        available_voices = [v.key for v in local_voices]
-        return yarp.SVector(available_voices)
+class SpeechSynthesizer(roboticslab_speech.SpeechSynthesis):
+    def __init__(self, player):
+        super().__init__()
+        self._player = player
 
     def say(self, text):
-        self.engine.begin_utterance()
-        self.engine.speak_text(text)
-
-        for result in self.engine.end_utterance():
-            self.result_queue.put(result)
-
+        print('saying: %s' % text)
+        self._player.set_generator(self._get_generator(text))
         return True
 
     def play(self):
-        return super().play()
+        self._player.resume()
+        print('resumed')
+        return True
 
     def pause(self):
-        return super().pause()
+        self._player.pause()
+        print('paused')
+        return True
 
     def stop(self):
-        if self.p:
-            self.p.terminate()
-
+        self._player.clear_generator()
+        print('stopped')
         return True
 
     def checkSayDone(self):
-        return not self.is_playing
+        return not self._player.is_playing()
 
-    def _process_result(self):
-        while True:
-            result = self.result_queue.get()
+    def setSpeed(self, speed):
+        return super().setSpeed(speed)
-            if result is None:
-                break
+    def setPitch(self, pitch):
+        return super().setPitch(pitch)
-            wav_bytes = result.to_wav_bytes()
+    def getSpeed(self):
+        return super().getSpeed()
-            if not wav_bytes:
-                continue
+    def getPitch(self):
+        return super().getPitch()
-            with tempfile.NamedTemporaryFile(mode='wb+', suffix='.wav') as wav_file:
-                wav_file.write(wav_bytes)
-                wav_file.seek(0)
+    @abstractmethod
+    def get_sample_rate(self):
+        pass
-                for play_program in reversed(PLAY_PROGRAMS):
-                    play_cmd = shlex.split(play_program)
+    @abstractmethod
+    def _get_generator(self, text):
+        pass
-                    if not shutil.which(play_cmd[0]):
-                        continue
+class PiperSynthesizer(SpeechSynthesizer):
+    def __init__(self, player, model, use_cuda, rf):
+        super().__init__(player)
+        self._rf = rf
+        self._use_cuda = use_cuda
+        self._voice = self._load_model(model)
-                    play_cmd.append(wav_file.name)
-                    self.is_playing = True
+    def _load_model(self, model):
+        from piper import PiperVoice
-                    with subprocess.Popen(play_cmd) as self.p:
-                        try:
-                            self.p.wait()
-                        except: # e.g. on keyboard interrupt
-                            self.p.kill()
+        if not model.endswith('.onnx'):
+            model += '.onnx'
-                    self.is_playing = False
-                    break
+        path = self._rf.findFileByName(model)
+        return PiperVoice.load(path, use_cuda=self._use_cuda)
-parser = argparse.ArgumentParser(prog='speechSynthesis', description='TTS service running a Mimic 3 engine')
-parser.add_argument('--voice', '-v', help='Name of voice (expected in /)', required=True)
-parser.add_argument('--speaker', '-s', help='Name or number of speaker (default: first speaker)')
-parser.add_argument('--noise-scale', type=float, help='Noise scale [0-1], default is 0.667')
-parser.add_argument('--length-scale', type=float, help='Length scale (1.0 is default speed, 0.5 is 2x faster)')
-parser.add_argument('--noise-w', type=float, help='Variation in cadence [0-1], default is 0.8')
-parser.add_argument('--cuda', action='store_true', help='Use Onnx CUDA execution provider (requires onnxruntime-gpu)')
-parser.add_argument('--port', '-p', default='/speechSynthesis', help='YARP port prefix')
+    def get_sample_rate(self):
+        return self._voice.config.sample_rate
-args = parser.parse_args()
+    def _get_generator(self, text):
+        return self._voice.synthesize_stream_raw(text)
-tts = mimic3_tts.Mimic3TextToSpeechSystem(
-    mimic3_tts.Mimic3Settings(
-        length_scale=args.length_scale,
-        noise_scale=args.noise_scale,
-        noise_w=args.noise_w,
-        use_cuda=args.cuda,
-    )
-)
+    def setLanguage(self, language):
+        try:
+            self._voice = self._load_model(language)
+            print('switched language to %s' % language)
+            return True
+        except IOError as e:
+            print(e)
+            return False
-tts.voice = args.voice
-tts.speaker = args.speaker
+    def getSupportedLangs(self):
+        from pathlib import Path
+        path = Path(self._rf.findFile('from'))
+        return yarp.SVector([f.stem for f in path.parent.iterdir() if f.is_file() and f.name.endswith('.onnx')])
+
+def int_or_str(text):
+    """Helper function for argument parsing."""
+    try:
+        return int(text)
+    except ValueError:
+        return text
+
+BACKENDS = ['piper']
+
+parser = argparse.ArgumentParser(description='YARP service that transforms text into live audio output', add_help=False)
+parser.add_argument('--list-devices', action='store_true', help='list available audio devices and exit')
+parser.add_argument('--list-backends', action='store_true', help='list available TTS backends and exit')
+args, remaining = parser.parse_known_args()
+
+if args.list_devices:
+    print(sd.query_devices())
+    raise SystemExit
+elif args.list_backends:
+    print('\n'.join(BACKENDS))
+    raise SystemExit
-print('Preloading voice: %s' % args.voice)
-tts.preload_voice(args.voice)
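+# The first parser only consumes --list-devices/--list-backends (parse_known_args);
+# this second parser inherits its options via parents=[parser] and handles the rest.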
+parser = argparse.ArgumentParser(description=parser.description, formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[parser])
+parser.add_argument('--backend', '-b', type=str, required=True, help='TTS backend engine')
+parser.add_argument('--device', '-d', type=int_or_str, help='output device (numeric ID or substring)')
+parser.add_argument('--model', type=str, required=True, help='model, e.g. es_ES-davefx-medium')
+parser.add_argument('--cuda', action='store_true', help='use ONNX CUDA execution provider (requires onnxruntime-gpu)')
+parser.add_argument('--prefix', '-p', type=str, default='/speechSynthesis', help='YARP port prefix')
+parser.add_argument('--context', type=str, default='speechSynthesis', help='YARP context directory')
+parser.add_argument('--from', type=str, dest='ini', default='speechSynthesis.ini', help='YARP configuration (.ini) file')
+args = parser.parse_args(remaining)
 
 yarp.Network.init()
+
+rf = yarp.ResourceFinder()
+rf.setDefaultContext(args.context)
+rf.setDefaultConfigFile(args.ini)
+
+if args.backend == 'piper':
+    synthesizer_factory = PiperSynthesizerFactory(args.model, args.cuda, rf)
+else:
+    print('Backend not available, must be one of: %s' % ', '.join(BACKENDS))
+    raise SystemExit
+
 if not yarp.Network.checkNetwork():
-    print('YARP network not found')
+    print('No YARP network available')
     raise SystemExit
 
 rpc = yarp.RpcServer()
-processor = TextToSpeechResponder(tts)
 
-if not rpc.open(args.port + '/rpc:s'):
-    print('Cannot open port %s' % rpc.getName())
+if not rpc.open(args.prefix + '/rpc:s'):
+    print('Unable to open RPC port')
     raise SystemExit
 
-processor.yarp().attachAsServer(rpc)
-
-quitRequested = False
-
-def askToStop():
-    global quitRequested
-    quitRequested = True
+class CallbackPlayer:
+    def __init__(self):
+        self._generator = None
+        self._queued_generator = None
+        self._is_paused = False
-signal.signal(signal.SIGINT, lambda signal, frame: askToStop())
-signal.signal(signal.SIGTERM, lambda signal, frame: askToStop())
+    def set_generator(self, generator):
+        self._generator = generator
+        self._queued_generator = None
+        self._is_paused = False
-while not quitRequested:
-    time.sleep(0.1)
+    def clear_generator(self):
+        self._generator = None
+        self._queued_generator = None
+        self._is_paused = False
-rpc.interrupt()
-rpc.close()
+    def pause(self):
+        self._is_paused = True
+
+    def resume(self):
+        self._is_paused = False
+
+    def is_playing(self):
+        return self._generator is not None and not self._is_paused
+
+    def callback(self, outdata, frames, time, status):
+        # https://stackoverflow.com/a/62609827
+        # feed fixed-size blocks: zero-pad short chunks, split oversized ones and queue the remainder
+
+        if self.is_playing():
+            try:
+                raw = next(self._generator)
+
+                if len(outdata) > len(raw):
+                    outdata[:len(raw)] = raw
+                    outdata[len(raw):] = b'\x00' * (len(outdata) - len(raw))
+                elif len(outdata) < len(raw):
+                    outdata[:] = raw[:len(outdata)]
+                    self._queued_generator = self._generator
+                    self._generator = iter([raw[len(outdata):]])
+                else:
+                    outdata[:] = raw
+
+                return
+            except StopIteration:
+                if self._queued_generator is not None:
+                    self._generator = self._queued_generator
+                    self._queued_generator = None
+                else:
+                    self.clear_generator()
+
+        outdata[:] = b'\x00' * len(outdata)
+
+try:
+    player = CallbackPlayer()
+    synthesizer = synthesizer_factory.create(player)
+
+    with sd.RawOutputStream(samplerate=synthesizer.get_sample_rate(),
+                            blocksize=1024,
+                            device=args.device,
+                            dtype='int16',
+                            channels=1,
+                            callback=player.callback) as stream:
+        synthesizer.yarp().attachAsServer(rpc)
-processor.result_queue.put(None)
-processor.result_thread.join()
+
+        import time
+
+        while True:
+            time.sleep(0.1)
+except KeyboardInterrupt:
+    rpc.interrupt()
+    rpc.close()
+    parser.exit(0)
diff --git a/share/CMakeLists.txt b/share/CMakeLists.txt
index 566b8f6..b499491 100644
--- a/share/CMakeLists.txt
+++ b/share/CMakeLists.txt
@@ -4,3 +4,4 @@ yarp_install(FILES applications/ymanager.ini
              DESTINATION ${ROBOTICSLAB-SPEECH_APPLICATIONS_INSTALL_DIR})
 
 add_subdirectory(speechRecognition)
+add_subdirectory(speechSynthesis)
diff --git a/share/speechSynthesis/CMakeLists.txt b/share/speechSynthesis/CMakeLists.txt
new file mode 100644
index 0000000..ef58d68
--- /dev/null
+++ b/share/speechSynthesis/CMakeLists.txt
@@ -0,0 +1,2 @@
+yarp_install(FILES speechSynthesis.ini
+             DESTINATION ${ROBOTICSLAB-SPEECH_CONTEXTS_INSTALL_DIR}/speechSynthesis)
diff --git a/share/speechSynthesis/speechSynthesis.ini b/share/speechSynthesis/speechSynthesis.ini
new file mode 100644
index 0000000..9a3005f
--- /dev/null
+++ b/share/speechSynthesis/speechSynthesis.ini
@@ -0,0 +1,2 @@
+/// dummy file to make sure the directory is created
+/// NOTE: this file must be located in the same directory as the voice config files (*.onnx.json)
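+///
+/// example layout, assuming the es_ES-davefx-medium voice used in the README (any other piper voice works the same way):
+///   speechSynthesis/
+///     speechSynthesis.ini
+///     es_ES-davefx-medium.onnx
+///     es_ES-davefx-medium.onnx.json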