From 655e0fa83650246f71fb5c823d614f4a2934ade1 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 14 Oct 2023 14:21:53 +0800 Subject: [PATCH] add python API and examples for TTS (#364) --- .github/scripts/test-python.sh | 20 +++ .github/workflows/run-python-test.yaml | 9 +- CMakeLists.txt | 2 +- cmake/cmake_extension.py | 4 + python-api-examples/offline-tts.py | 120 ++++++++++++++++++ setup.py | 4 + sherpa-onnx/csrc/offline-tts-vits-impl.h | 4 +- sherpa-onnx/python/csrc/CMakeLists.txt | 3 + .../python/csrc/offline-tts-model-config.cc | 32 +++++ .../python/csrc/offline-tts-model-config.h | 16 +++ .../csrc/offline-tts-vits-model-config.cc | 27 ++++ .../csrc/offline-tts-vits-model-config.h | 16 +++ sherpa-onnx/python/csrc/offline-tts.cc | 46 +++++++ sherpa-onnx/python/csrc/offline-tts.h | 16 +++ sherpa-onnx/python/csrc/sherpa-onnx.cc | 3 + sherpa-onnx/python/sherpa_onnx/__init__.py | 4 + 16 files changed, 320 insertions(+), 6 deletions(-) create mode 100755 python-api-examples/offline-tts.py create mode 100644 sherpa-onnx/python/csrc/offline-tts-model-config.cc create mode 100644 sherpa-onnx/python/csrc/offline-tts-model-config.h create mode 100644 sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc create mode 100644 sherpa-onnx/python/csrc/offline-tts-vits-model-config.h create mode 100644 sherpa-onnx/python/csrc/offline-tts.cc create mode 100644 sherpa-onnx/python/csrc/offline-tts.h diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index 82f9e1cf2..e6f37764a 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -8,6 +8,24 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } +log "Offline TTS test" + +wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx +wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt +wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt + +python3 
./python-api-examples/offline-tts.py \ + --vits-model=./vits-ljs.onnx \ + --vits-lexicon=./lexicon.txt \ + --vits-tokens=./tokens.txt \ + --output-filename=./tts.wav \ + 'liliana, the most beautiful and lovely assistant of our team!' + +ls -lh ./tts.wav +file ./tts.wav + +rm -v vits-ljs.onnx ./lexicon.txt ./tokens.txt + mkdir -p /tmp/icefall-models dir=/tmp/icefall-models @@ -171,3 +189,5 @@ rm -rf $repo git clone https://github.com/pkufool/sherpa-test-data /tmp/sherpa-test-data python3 sherpa-onnx/python/tests/test_text2token.py --verbose + +rm -rf /tmp/sherpa-test-data diff --git a/.github/workflows/run-python-test.yaml b/.github/workflows/run-python-test.yaml index acc1c68ff..56f98a9ff 100644 --- a/.github/workflows/run-python-test.yaml +++ b/.github/workflows/run-python-test.yaml @@ -42,7 +42,7 @@ jobs: python-version: "3.10" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -54,7 +54,7 @@ jobs: - name: Install Python dependencies shell: bash run: | - python3 -m pip install --upgrade pip numpy sentencepiece==0.1.96 + python3 -m pip install --upgrade pip numpy sentencepiece==0.1.96 soundfile - name: Install sherpa-onnx shell: bash @@ -65,3 +65,8 @@ jobs: shell: bash run: | .github/scripts/test-python.sh + + - uses: actions/upload-artifact@v3 + with: + name: tts-generated-test-files + path: tts.wav diff --git a/CMakeLists.txt b/CMakeLists.txt index d12bc929f..2726998fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.7.21") +set(SHERPA_ONNX_VERSION "1.8.0") # Disable warning about # diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index 4ff1f8c40..e8b9c6586 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -137,6 +137,7 @@ def build_extension(self, ext: setuptools.extension.Extension): binaries += ["sherpa-onnx-offline-websocket-server"] binaries += 
["sherpa-onnx-online-websocket-client"] binaries += ["sherpa-onnx-vad-microphone"] + binaries += ["sherpa-onnx-offline-tts"] if is_windows(): binaries += ["kaldi-native-fbank-core.dll"] @@ -144,6 +145,9 @@ def build_extension(self, ext: setuptools.extension.Extension): binaries += ["sherpa-onnx-core.dll"] binaries += ["sherpa-onnx-portaudio.dll"] binaries += ["onnxruntime.dll"] + binaries += ["kaldi-decoder-core.dll"] + binaries += ["sherpa-onnx-fst.dll"] + binaries += ["sherpa-onnx-kaldifst-core.dll"] for f in binaries: suffix = "" if "dll" in f else suffix diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py new file mode 100755 index 000000000..e264a0926 --- /dev/null +++ b/python-api-examples/offline-tts.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 Xiaomi Corporation + +""" +This file demonstrates how to use sherpa-onnx Python API to generate audio +from text, i.e., text-to-speech. + +Usage: + +1. Download a model + +wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx +wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt +wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt + +python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-ljs.onnx \ + --vits-lexicon=./lexicon.txt \ + --vits-tokens=./tokens.txt \ + --output-filename=./generated.wav \ + 'liliana, the most beautiful and lovely assistant of our team!' 
+"""
+
+import argparse
+
+import sherpa_onnx
+import soundfile as sf
+
+
+def get_args():  # Build the command-line parser and return the parsed arguments.
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--vits-model",
+        type=str,
+        help="Path to vits model.onnx",
+    )
+
+    parser.add_argument(
+        "--vits-lexicon",
+        type=str,
+        help="Path to lexicon.txt",
+    )
+
+    parser.add_argument(
+        "--vits-tokens",
+        type=str,
+        help="Path to tokens.txt",
+    )
+
+    parser.add_argument(
+        "--output-filename",
+        type=str,
+        default="./generated.wav",
+        help="Path to save generated wave",
+    )
+
+    parser.add_argument(
+        "--debug",  # parsed explicitly: type=bool would turn "--debug=False" into True
+        type=lambda s: s.strip().lower() in ("1", "true", "yes", "on"),
+        default=False,
+        help="True to show debug messages",
+    )
+
+    parser.add_argument(
+        "--provider",
+        type=str,
+        default="cpu",
+        help="valid values: cpu, cuda, coreml",
+    )
+
+    parser.add_argument(
+        "--num-threads",
+        type=int,
+        default=1,
+        help="Number of threads for neural network computation",
+    )
+
+    parser.add_argument(
+        "text",
+        type=str,
+        help="The input text to generate audio for",
+    )
+
+    return parser.parse_args()
+
+
+def main():  # Generate audio for args.text and save it as a PCM_16 file.
+    args = get_args()
+    print(args)
+
+    tts_config = sherpa_onnx.OfflineTtsConfig(
+        model=sherpa_onnx.OfflineTtsModelConfig(
+            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
+                model=args.vits_model,
+                lexicon=args.vits_lexicon,
+                tokens=args.vits_tokens,
+            ),
+            provider=args.provider,
+            debug=args.debug,
+            num_threads=args.num_threads,
+        )
+    )
+    tts = sherpa_onnx.OfflineTts(tts_config)
+    audio = tts.generate(args.text)
+    sf.write(
+        args.output_filename,
+        audio.samples,
+        samplerate=audio.sample_rate,
+        subtype="PCM_16",
+    )
+    print(f"Saved to {args.output_filename}")
+    print(f"The text is '{args.text}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index d6084a19b..0cc39c26b 100644
--- a/setup.py
+++ b/setup.py
@@ -57,12 +57,16 @@ def get_binaries_to_install():
     binaries += ["sherpa-onnx-offline-websocket-server"]
     binaries += ["sherpa-onnx-online-websocket-client"]
binaries += ["sherpa-onnx-vad-microphone"] + binaries += ["sherpa-onnx-offline-tts"] if is_windows(): binaries += ["kaldi-native-fbank-core.dll"] binaries += ["sherpa-onnx-c-api.dll"] binaries += ["sherpa-onnx-core.dll"] binaries += ["sherpa-onnx-portaudio.dll"] binaries += ["onnxruntime.dll"] + binaries += ["kaldi-decoder-core.dll"] + binaries += ["sherpa-onnx-fst.dll"] + binaries += ["sherpa-onnx-kaldifst-core.dll"] exe = [] for f in binaries: diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index e9b94064c..abbf28193 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -21,9 +21,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) : model_(std::make_unique(config.model)), lexicon_(config.model.vits.lexicon, config.model.vits.tokens, - model_->Punctuations()) { - SHERPA_ONNX_LOGE("config: %s\n", config.ToString().c_str()); - } + model_->Punctuations()) {} GeneratedAudio Generate(const std::string &text) const override { std::vector x = lexicon_.ConvertTextToTokenIds(text); diff --git a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt index a5832d22e..62500bdc8 100644 --- a/sherpa-onnx/python/csrc/CMakeLists.txt +++ b/sherpa-onnx/python/csrc/CMakeLists.txt @@ -14,6 +14,9 @@ pybind11_add_module(_sherpa_onnx offline-stream.cc offline-tdnn-model-config.cc offline-transducer-model-config.cc + offline-tts-model-config.cc + offline-tts-vits-model-config.cc + offline-tts.cc offline-whisper-model-config.cc offline-zipformer-ctc-model-config.cc online-lm-config.cc diff --git a/sherpa-onnx/python/csrc/offline-tts-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-model-config.cc new file mode 100644 index 000000000..e5e86d968 --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-model-config.cc @@ -0,0 +1,32 @@ +// sherpa-onnx/python/csrc/offline-tts-model-config.cc +// +// 
Copyright (c) 2023 Xiaomi Corporation + +#include "sherpa-onnx/python/csrc/offline-tts-model-config.h" + +#include + +#include "sherpa-onnx/csrc/offline-tts-model-config.h" +#include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsModelConfig(py::module *m) { + PybindOfflineTtsVitsModelConfig(m); + + using PyClass = OfflineTtsModelConfig; + + py::class_(*m, "OfflineTtsModelConfig") + .def(py::init<>()) + .def(py::init(), + py::arg("vits"), py::arg("num_threads") = 1, + py::arg("debug") = false, py::arg("provider") = "cpu") + .def_readwrite("vits", &PyClass::vits) + .def_readwrite("num_threads", &PyClass::num_threads) + .def_readwrite("debug", &PyClass::debug) + .def_readwrite("provider", &PyClass::provider) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts-model-config.h b/sherpa-onnx/python/csrc/offline-tts-model-config.h new file mode 100644 index 000000000..48a08373c --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-model-config.h @@ -0,0 +1,16 @@ +// sherpa-onnx/python/csrc/offline-tts-model-config.h +// +// Copyright (c) 2023 Xiaomi Corporation + +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_ +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_ + +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsModelConfig(py::module *m); + +} + +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc new file mode 100644 index 000000000..2471c3f5e --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc @@ -0,0 +1,27 @@ +// sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc +// +// Copyright (c) 2023 Xiaomi Corporation + +#include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" + +#include + 
+#include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsVitsModelConfig(py::module *m) { + using PyClass = OfflineTtsVitsModelConfig; + + py::class_(*m, "OfflineTtsVitsModelConfig") + .def(py::init<>()) + .def(py::init(), + py::arg("model"), py::arg("lexicon"), py::arg("tokens")) + .def_readwrite("model", &PyClass::model) + .def_readwrite("lexicon", &PyClass::lexicon) + .def_readwrite("tokens", &PyClass::tokens) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.h b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.h new file mode 100644 index 000000000..b9be3b150 --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.h @@ -0,0 +1,16 @@ +// sherpa-onnx/python/csrc/offline-tts-vits-model-config.h +// +// Copyright (c) 2023 Xiaomi Corporation + +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_ +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_ + +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsVitsModelConfig(py::module *m); + +} + +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc new file mode 100644 index 000000000..199e4df2d --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -0,0 +1,46 @@ +// sherpa-onnx/python/csrc/offline-tts.cc +// +// Copyright (c) 2023 Xiaomi Corporation +#include "sherpa-onnx/python/csrc/offline-tts.h" + +#include "sherpa-onnx/csrc/offline-tts.h" +#include "sherpa-onnx/python/csrc/offline-tts-model-config.h" + +namespace sherpa_onnx { + +static void PybindGeneratedAudio(py::module *m) { + using PyClass = GeneratedAudio; + py::class_(*m, "GeneratedAudio") + .def(py::init<>()) + .def_readwrite("samples", &PyClass::samples) + .def_readwrite("sample_rate", &PyClass::sample_rate) + 
.def("__str__", [](PyClass &self) { + std::ostringstream os; + os << "GeneratedAudio(sample_rate=" << self.sample_rate << ", "; + os << "num_samples=" << self.samples.size() << ")"; + return os.str(); + }); +} + +static void PybindOfflineTtsConfig(py::module *m) { + PybindOfflineTtsModelConfig(m); + + using PyClass = OfflineTtsConfig; + py::class_(*m, "OfflineTtsConfig") + .def(py::init<>()) + .def(py::init(), py::arg("model")) + .def_readwrite("model", &PyClass::model) + .def("__str__", &PyClass::ToString); +} + +void PybindOfflineTts(py::module *m) { + PybindOfflineTtsConfig(m); + PybindGeneratedAudio(m); + + using PyClass = OfflineTts; + py::class_(*m, "OfflineTts") + .def(py::init(), py::arg("config")) + .def("generate", &PyClass::Generate); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts.h b/sherpa-onnx/python/csrc/offline-tts.h new file mode 100644 index 000000000..fd14e1a2f --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts.h @@ -0,0 +1,16 @@ +// sherpa-onnx/python/csrc/offline-tts.h +// +// Copyright (c) 2023 Xiaomi Corporation + +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_ +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_ + +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" + +namespace sherpa_onnx { + +void PybindOfflineTts(py::module *m); + +} + +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_ diff --git a/sherpa-onnx/python/csrc/sherpa-onnx.cc b/sherpa-onnx/python/csrc/sherpa-onnx.cc index 27f5f827a..4af65fe76 100644 --- a/sherpa-onnx/python/csrc/sherpa-onnx.cc +++ b/sherpa-onnx/python/csrc/sherpa-onnx.cc @@ -13,6 +13,7 @@ #include "sherpa-onnx/python/csrc/offline-model-config.h" #include "sherpa-onnx/python/csrc/offline-recognizer.h" #include "sherpa-onnx/python/csrc/offline-stream.h" +#include "sherpa-onnx/python/csrc/offline-tts.h" #include "sherpa-onnx/python/csrc/online-lm-config.h" #include "sherpa-onnx/python/csrc/online-model-config.h" #include "sherpa-onnx/python/csrc/online-recognizer.h" @@ -45,6 +46,8 
@@ PYBIND11_MODULE(_sherpa_onnx, m) { PybindVadModel(&m); PybindCircularBuffer(&m); PybindVoiceActivityDetector(&m); + + PybindOfflineTts(&m); } } // namespace sherpa_onnx diff --git a/sherpa-onnx/python/sherpa_onnx/__init__.py b/sherpa-onnx/python/sherpa_onnx/__init__.py index 61158d36b..d8ed0d4d8 100644 --- a/sherpa-onnx/python/sherpa_onnx/__init__.py +++ b/sherpa-onnx/python/sherpa_onnx/__init__.py @@ -2,6 +2,10 @@ CircularBuffer, Display, OfflineStream, + OfflineTts, + OfflineTtsConfig, + OfflineTtsModelConfig, + OfflineTtsVitsModelConfig, OnlineStream, SileroVadModelConfig, SpeechSegment,