From d29e7aa27294c8b73252d8f55bd627ab118339fb Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 16 Aug 2024 22:07:09 +0800 Subject: [PATCH] Fix looking up OOVs in lexicon.txt for MeloTTS models. If an English word does not exist in the lexicon, we split it into characters. For instance, if the word TTS does not exist in lexicon.txt, we split it into 3 characters T, T, and S. --- sherpa-onnx/csrc/melo-tts-lexicon.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.cc b/sherpa-onnx/csrc/melo-tts-lexicon.cc index fb39de8d2..e379b9c2f 100644 --- a/sherpa-onnx/csrc/melo-tts-lexicon.cc +++ b/sherpa-onnx/csrc/melo-tts-lexicon.cc @@ -136,6 +136,22 @@ class MeloTtsLexicon::Impl { ans.tokens.insert(ans.tokens.end(), ids.tokens.begin(), ids.tokens.end()); ans.tones.insert(ans.tones.end(), ids.tones.begin(), ids.tones.end()); + } else { + // If the lexicon does not contain the word, we split the word into + // characters. + // + // For instance, if the word is TTS and it does not exist + // in the lexicon, we split it into 3 characters: T T S + std::string s; + for (char c : word) { + s = c; + if (word2ids_.count(s)) { + const auto &t = word2ids_.at(s); + ans.tokens.insert(ans.tokens.end(), t.tokens.begin(), + t.tokens.end()); + ans.tones.insert(ans.tones.end(), t.tones.begin(), t.tones.end()); + } + } } }