v0.20.5 #3226

Merged: 4 commits, Nov 15, 2023
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
📚 Utilities for dataset analysis and curation.
______________________________________________________________________

[![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.20.4
0.20.5
TTS/tts/layers/xtts/gpt.py (11 changes: 1 addition & 10 deletions)

@@ -426,15 +426,6 @@ def forward(
         if max_mel_len > audio_codes.shape[-1]:
             audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1]))

-        silence = True
-        for idx, l in enumerate(code_lengths):
-            length = l.item()
-            while silence:
-                if audio_codes[idx, length - 1] != 83:
-                    break
-                length -= 1
-            code_lengths[idx] = length
-
         # 💖 Lovely assertions
         assert (
             max_mel_len <= audio_codes.shape[-1]
@@ -450,7 +441,7 @@ def forward(
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)

         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths)
+        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths, without counting the start and stop tokens that have not been added yet

         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
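
For reference, a minimal runnable sketch of what the deleted block computed: it walked each sequence back from its nominal length while the final code was the silence token (83 in the removed code). The release replaces this with the explicit `code_lengths - 3` passed to `set_mel_padding`, accounting for the start and stop tokens appended later in `forward`. The helper below illustrates the old behavior only; it is not the library's API:

```python
import torch

def trim_trailing_silence(audio_codes, code_lengths, silence_token=83):
    # Walk back from each sequence's nominal length while the last
    # code is still the silence token, mirroring the removed loop.
    trimmed = code_lengths.clone()
    for idx, l in enumerate(code_lengths):
        length = int(l.item())
        while length > 0 and audio_codes[idx, length - 1] == silence_token:
            length -= 1
        trimmed[idx] = length
    return trimmed

codes = torch.tensor([[5, 7, 83, 83], [9, 9, 9, 2]])
print(trim_trailing_silence(codes, torch.tensor([4, 4])))  # tensor([2, 4])
```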
TTS/tts/layers/xtts/tokenizer.py (12 changes: 6 additions & 6 deletions)

@@ -115,7 +115,7 @@
             # There are not many common abbreviations in Arabic as in English.
         ]
     ],
-    "zh": [
+    "zh-cn": [
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
             # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
@@ -280,7 +280,7 @@ def expand_abbreviations_multilingual(text, lang="en"):
             ("°", " درجة "),
         ]
     ],
-    "zh": [
+    "zh-cn": [
         # Chinese
         (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
         for x in [
@@ -571,7 +571,7 @@ def check_input_length(self, txt, lang):
         )

     def preprocess_text(self, txt, lang):
-        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn"}:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh-cn", "zh-cn"}:
             txt = multilingual_cleaners(txt, lang)
         if lang in {"zh", "zh-cn"}:
             txt = chinese_transliterate(txt)
@@ -682,8 +682,8 @@ def test_expand_numbers_multilingual():
         ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
         ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
         # Chinese (Simplified)
-        ("在12.5秒内", "在十二点五秒内", "zh"),
-        ("有50名士兵", "有五十名士兵", "zh"),
+        ("在12.5秒内", "在十二点五秒内", "zh-cn"),
+        ("有50名士兵", "有五十名士兵", "zh-cn"),
         # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
         # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
         # Turkish
@@ -764,7 +764,7 @@ def test_symbols_multilingual():
         ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
         ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
         ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
-        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
+        ("我的电量为 14%", "我的电量为 14 百分之", "zh-cn"),
         ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
         ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
         ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
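
The net effect of these tokenizer changes is that the abbreviation and symbol tables are now keyed by "zh-cn" instead of "zh". A small hypothetical sketch of what that implies for lookups; the table contents and the `normalize_lang` helper are illustrative assumptions, not code from the repo:

```python
# Illustrative only: tables renamed from "zh" to "zh-cn" mean a bare "zh"
# must be normalized before lookup (helper name is hypothetical).
_symbol_tables = {
    "en": [("%", " percent")],
    "zh-cn": [("%", " 百分之")],  # key renamed from "zh" in this release
}

def normalize_lang(lang: str) -> str:
    return "zh-cn" if lang == "zh" else lang

assert normalize_lang("zh") in _symbol_tables
assert normalize_lang("en") in _symbol_tables
```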
TTS/tts/models/xtts.py (48 changes: 17 additions & 31 deletions)

@@ -7,7 +7,6 @@
 import torchaudio
 from coqpit import Coqpit

-from TTS.tts.layers.tortoise.audio_utils import wav_to_univnet_mel
 from TTS.tts.layers.xtts.gpt import GPT
 from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
 from TTS.tts.layers.xtts.stream_generator import init_stream_support
@@ -308,26 +307,6 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
         cond_latent = self.gpt.get_style_emb(mel.to(self.device))
         return cond_latent.transpose(1, 2)

-    @torch.inference_mode()
-    def get_diffusion_cond_latents(self, audio, sr):
-        from math import ceil
-
-        diffusion_conds = []
-        CHUNK_SIZE = 102400
-        audio_24k = torchaudio.functional.resample(audio, sr, 24000)
-        for chunk in range(ceil(audio_24k.shape[1] / CHUNK_SIZE)):
-            current_sample = audio_24k[:, chunk * CHUNK_SIZE : (chunk + 1) * CHUNK_SIZE]
-            current_sample = pad_or_truncate(current_sample, CHUNK_SIZE)
-            cond_mel = wav_to_univnet_mel(
-                current_sample.to(self.device),
-                do_normalization=False,
-                device=self.device,
-            )
-            diffusion_conds.append(cond_mel)
-        diffusion_conds = torch.stack(diffusion_conds, dim=1)
-        diffusion_latent = self.diffusion_decoder.get_conditioning(diffusion_conds)
-        return diffusion_latent
-
     @torch.inference_mode()
     def get_speaker_embedding(self, audio, sr):
         audio_16k = torchaudio.functional.resample(audio, sr, 16000)
@@ -530,8 +509,10 @@ def inference(
         top_p=0.85,
         do_sample=True,
         num_beams=1,
+        speed=1.0,
         **hf_generate_kwargs,
     ):
+        length_scale = 1.0 / max(speed, 0.05)
         text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)

@@ -573,16 +554,13 @@ def inference(
             return_attentions=False,
             return_latent=True,
         )
-        silence_token = 83
-        ctokens = 0
-        for k in range(gpt_codes.shape[-1]):
-            if gpt_codes[0, k] == silence_token:
-                ctokens += 1
-            else:
-                ctokens = 0
-            if ctokens > 8:
-                gpt_latents = gpt_latents[:, :k]
-                break
+
+        if length_scale != 1.0:
+            gpt_latents = F.interpolate(
+                gpt_latents.transpose(1, 2),
+                scale_factor=length_scale,
+                mode="linear"
+            ).transpose(1, 2)

         wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
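
For context, a self-contained sketch of the time-stretch that the new `speed` argument applies to the GPT latents before vocoding. The tensor shape here is an illustrative assumption (batch, time, channels), not a value taken from the model:

```python
import torch
import torch.nn.functional as F

speed = 1.5                              # 1.0 = normal pace; >1 is faster speech
length_scale = 1.0 / max(speed, 0.05)    # clamp guards against division by ~0

gpt_latents = torch.randn(1, 120, 1024)  # assumed (batch, time, channels) latent
stretched = F.interpolate(
    gpt_latents.transpose(1, 2),         # interpolate along the time axis
    scale_factor=length_scale,
    mode="linear",
).transpose(1, 2)

print(stretched.shape)  # torch.Size([1, 80, 1024]): shorter latents => faster audio
```

Stretching the latent sequence, rather than resampling the output waveform, lets the vocoder regenerate audio at the new pace, which should avoid the pitch shift that naive waveform resampling would introduce.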

@@ -634,8 +612,10 @@ def inference_stream(
         top_k=50,
         top_p=0.85,
         do_sample=True,
+        speed=1.0,
         **hf_generate_kwargs,
     ):
+        length_scale = 1.0 / max(speed, 0.05)
         text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)

@@ -674,6 +654,12 @@

             if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size):
                 gpt_latents = torch.cat(all_latents, dim=0)[None, :]
+                if length_scale != 1.0:
+                    gpt_latents = F.interpolate(
+                        gpt_latents.transpose(1, 2),
+                        scale_factor=length_scale,
+                        mode="linear"
+                    ).transpose(1, 2)
                 wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
                 wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
                     wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len
tests/zoo_tests/test_models.py (34 changes: 31 additions & 3 deletions)

@@ -111,7 +111,7 @@ def test_xtts_streaming():
     model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

     print("Computing speaker latents...")
-    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)

     print("Inference...")
     chunks = model.inference_stream(
@@ -139,7 +139,7 @@ def test_xtts_v2():
             "yes | "
             f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
             f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
-            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" "--language_idx "en"'
+            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
         )
     else:
         run_cli(
@@ -164,7 +164,7 @@ def test_xtts_v2_streaming():
     model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

     print("Computing speaker latents...")
-    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)

     print("Inference...")
     chunks = model.inference_stream(
@@ -179,6 +179,34 @@
             assert chunk.shape[-1] > 5000
         wav_chuncks.append(chunk)
     assert len(wav_chuncks) > 1
+    normal_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=1.5
+    )
+    wav_chuncks = []
+    for i, chunk in enumerate(chunks):
+        wav_chuncks.append(chunk)
+    fast_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=0.66
+    )
+    wav_chuncks = []
+    for i, chunk in enumerate(chunks):
+        wav_chuncks.append(chunk)
+    slow_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    assert slow_len > normal_len
+    assert normal_len > fast_len


 def test_tortoise():
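
As a rough sanity check on the assertions in the new streaming test: with linear interpolation along the latent time axis, the number of output samples should scale approximately with 1/speed. A back-of-envelope sketch with a made-up baseline length:

```python
# Expected ordering for the speed test above: length scales roughly as 1/speed.
normal_len = 100_000                 # pretend sample count at speed=1.0
fast_len = int(normal_len / 1.5)     # about 66,666 samples at speed=1.5
slow_len = int(normal_len / 0.66)    # about 151,515 samples at speed=0.66
assert slow_len > normal_len > fast_len
```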