From 04901fb2e4b74953bb5733205adcb7cb13655a06 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Tue, 14 Nov 2023 16:07:17 +0100 Subject: [PATCH 1/4] Add speed control for inference (#3214) * Add speed control for inference * Fix XTTS tests * Add speed control tests --- TTS/tts/models/xtts.py | 17 +++++++++++++++++ tests/zoo_tests/test_models.py | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index b277c3ac72..9198591273 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -530,8 +530,10 @@ def inference( top_p=0.85, do_sample=True, num_beams=1, + speed=1.0, **hf_generate_kwargs, ): + length_scale = 1.0 / max(speed, 0.05) text = text.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device) @@ -584,6 +586,13 @@ def inference( gpt_latents = gpt_latents[:, :k] break + if length_scale != 1.0: + gpt_latents = F.interpolate( + gpt_latents.transpose(1, 2), + scale_factor=length_scale, + mode="linear" + ).transpose(1, 2) + wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) return { @@ -634,8 +643,10 @@ def inference_stream( top_k=50, top_p=0.85, do_sample=True, + speed=1.0, **hf_generate_kwargs, ): + length_scale = 1.0 / max(speed, 0.05) text = text.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device) @@ -674,6 +685,12 @@ def inference_stream( if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size): gpt_latents = torch.cat(all_latents, dim=0)[None, :] + if length_scale != 1.0: + gpt_latents = F.interpolate( + gpt_latents.transpose(1, 2), + scale_factor=length_scale, + mode="linear" + ).transpose(1, 2) wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index d1c6b67c39..a5aad5c1ea 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -111,7 +111,7 @@ def test_xtts_streaming(): model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) print("Computing speaker latents...") - gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) print("Inference...") chunks = model.inference_stream( @@ -139,7 +139,7 @@ def test_xtts_v2(): "yes | " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " f'--text "This is an example." 
--out_path "{output_path}" --progress_bar False --use_cuda True ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" "--language_idx "en"' + f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' ) else: run_cli( @@ -164,7 +164,7 @@ def test_xtts_v2_streaming(): model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) print("Computing speaker latents...") - gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) print("Inference...") chunks = model.inference_stream( @@ -179,6 +179,34 @@ def test_xtts_v2_streaming(): assert chunk.shape[-1] > 5000 wav_chuncks.append(chunk) assert len(wav_chuncks) > 1 + normal_len = sum([len(chunk) for chunk in wav_chuncks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=1.5 + ) + wav_chuncks = [] + for i, chunk in enumerate(chunks): + wav_chuncks.append(chunk) + fast_len = sum([len(chunk) for chunk in wav_chuncks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=0.66 + ) + wav_chuncks = [] + for i, chunk in enumerate(chunks): + wav_chuncks.append(chunk) + slow_len = sum([len(chunk) for chunk in wav_chuncks]) + + assert slow_len > normal_len + assert normal_len > fast_len def test_tortoise(): From 15f0ac57d6786e3b171df7a21ccce78d2000d31f Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Wed, 15 Nov 2023 21:59:56 +0900 Subject: [PATCH 2/4] Update README.md (#3215) Dicord -> Discord --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 935627e588..4e5855f9ab 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ 📚 Utilities for dataset analysis and curation. 
______________________________________________________________________

-[![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
+[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License]()](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)

From 73a5bd08c0593feb135ad229ad628d6f85898ec0 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Wed, 15 Nov 2023 10:02:05 -0300
Subject: [PATCH 3/4] Fix XTTS GPT padding and inference issues (#3216)

* Fix end artifact for fine tuning models
* Bug fix on zh-cn inference
* Remove unused code
---
 TTS/tts/layers/xtts/gpt.py       | 11 +----------
 TTS/tts/layers/xtts/tokenizer.py | 12 ++++++------
 TTS/tts/models/xtts.py           | 31 -------------------------------
 3 files changed, 7 insertions(+), 47 deletions(-)

diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py
index 612da260ef..d914ebf90f 100644
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -426,15 +426,6 @@ def forward(
         if max_mel_len > audio_codes.shape[-1]:
             audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1]))

-        silence = True
-        for idx, l in enumerate(code_lengths):
-            length = l.item()
-            while silence:
-                if audio_codes[idx, length - 1] != 83:
-                    break
-                length -= 1
-            code_lengths[idx] = length
-
         # 💖 Lovely assertions
         assert (
             max_mel_len <= audio_codes.shape[-1]
@@ -450,7 +441,7 @@ def forward(
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)

         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths)
+        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths, without counting the start and stop tokens that have not been added yet

         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 211d0a93d9..7726d829ac 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -115,7 +115,7 @@
             # There are not many common abbreviations in Arabic as in English.
         ]
     ],
-    "zh": [
+    "zh-cn": [
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
             # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
@@ -280,7 +280,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ("°", " درجة "), ] ], - "zh": [ + "zh-cn": [ # Chinese (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ @@ -571,7 +571,7 @@ def check_input_length(self, txt, lang): ) def preprocess_text(self, txt, lang): - if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn"}: + if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh-cn", "zh-cn"}: txt = multilingual_cleaners(txt, lang) if lang in {"zh", "zh-cn"}: txt = chinese_transliterate(txt) @@ -682,8 +682,8 @@ def test_expand_numbers_multilingual(): ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"), ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"), # Chinese (Simplified) - ("在12.5秒内", "在十二点五秒内", "zh"), - ("有50名士兵", "有五十名士兵", "zh"), + ("在12.5秒内", "在十二点五秒内", "zh-cn"), + ("有50名士兵", "有五十名士兵", "zh-cn"), # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work # ("那将是20€先生", '那将是二十欧元先生', 'zh'), # Turkish @@ -764,7 +764,7 @@ def test_symbols_multilingual(): ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"), ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"), ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"), - ("我的电量为 14%", "我的电量为 14 百分之", "zh"), + ("我的电量为 14%", "我的电量为 14 百分之", "zh-cn"), ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 9198591273..f37f08449d 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -7,7 +7,6 @@ import torchaudio from coqpit import Coqpit -from TTS.tts.layers.tortoise.audio_utils import wav_to_univnet_mel from TTS.tts.layers.xtts.gpt import GPT from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support @@ -308,26 +307,6 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = cond_latent = self.gpt.get_style_emb(mel.to(self.device)) return cond_latent.transpose(1, 2) - @torch.inference_mode() - def get_diffusion_cond_latents(self, audio, sr): - from math import ceil - - diffusion_conds = [] - CHUNK_SIZE = 102400 - audio_24k = torchaudio.functional.resample(audio, sr, 24000) - for chunk in range(ceil(audio_24k.shape[1] / CHUNK_SIZE)): - current_sample = audio_24k[:, chunk * CHUNK_SIZE : (chunk + 1) * CHUNK_SIZE] - current_sample = pad_or_truncate(current_sample, CHUNK_SIZE) - cond_mel = wav_to_univnet_mel( - current_sample.to(self.device), - do_normalization=False, - device=self.device, - ) - diffusion_conds.append(cond_mel) - diffusion_conds = torch.stack(diffusion_conds, dim=1) - diffusion_latent = self.diffusion_decoder.get_conditioning(diffusion_conds) - return diffusion_latent - @torch.inference_mode() def get_speaker_embedding(self, audio, sr): audio_16k = torchaudio.functional.resample(audio, sr, 16000) @@ -575,16 +554,6 @@ def inference( return_attentions=False, return_latent=True, ) - silence_token = 83 - ctokens = 0 - for k in range(gpt_codes.shape[-1]): - if gpt_codes[0, k] == silence_token: - ctokens += 1 - else: - ctokens = 0 - if ctokens > 8: - gpt_latents = gpt_latents[:, :k] - break if length_scale != 1.0: gpt_latents = F.interpolate( From 88630c60e5c35be65f7e75b7a2a27a2b63ac87ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= 
Date: Wed, 15 Nov 2023 14:02:51 +0100 Subject: [PATCH 4/4] Update to v0.20.5 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 6dd46024a4..1b619f3482 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.20.4 +0.20.5
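
Note: the sketch below exercises the new `speed` argument introduced in PATCH 1/4, mirroring the calls in the updated zoo tests. It is illustrative only; the config/checkpoint paths and the reference wav are placeholders, not values taken from the patches.

import torch

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Placeholder paths -- point these at a local XTTS v2 download.
config = XttsConfig()
config.load_json("/path/to/xtts_v2/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts_v2/")
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# get_conditioning_latents() returns (gpt_cond_latent, speaker_embedding),
# matching the unpacking used in the updated streaming tests.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")

# speed > 1.0 shortens the output, speed < 1.0 stretches it; internally the GPT
# latents are interpolated with length_scale = 1.0 / max(speed, 0.05).
out = model.inference(
    "This is an example.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    speed=1.5,
)  # returns a dict that includes the synthesized waveform

# The streaming path accepts the same argument.
chunks = model.inference_stream(
    "This is an example.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    speed=0.66,
)
wav = torch.cat([chunk.cpu() for chunk in chunks], dim=-1)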