From df896301ca88c31b2e5765c64c59225bd16273e1 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Fri, 1 Dec 2023 10:44:49 +0000 Subject: [PATCH 01/10] Minor changes moving option to disable prior loss in config --- configs/experiment/ljspeech_no_prior_loss.yaml | 17 +++++++++++++++++ configs/model/matcha.yaml | 1 + matcha/models/matcha_tts.py | 9 +++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 configs/experiment/ljspeech_no_prior_loss.yaml diff --git a/configs/experiment/ljspeech_no_prior_loss.yaml b/configs/experiment/ljspeech_no_prior_loss.yaml new file mode 100644 index 0000000..6181950 --- /dev/null +++ b/configs/experiment/ljspeech_no_prior_loss.yaml @@ -0,0 +1,17 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: ljspeech.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["ljspeech"] + +run_name: ljspeech + +model: + prior_loss: false diff --git a/configs/model/matcha.yaml b/configs/model/matcha.yaml index 4700855..36f6eaf 100644 --- a/configs/model/matcha.yaml +++ b/configs/model/matcha.yaml @@ -12,3 +12,4 @@ spk_emb_dim: 64 n_feats: 80 data_statistics: ${data.data_statistics} out_size: null # Must be divisible by 4 +prior_loss: true diff --git a/matcha/models/matcha_tts.py b/matcha/models/matcha_tts.py index 6feb9e7..64b2c07 100644 --- a/matcha/models/matcha_tts.py +++ b/matcha/models/matcha_tts.py @@ -34,6 +34,7 @@ def __init__( out_size, optimizer=None, scheduler=None, + prior_loss=True, ): super().__init__() @@ -44,6 +45,7 @@ def __init__( self.spk_emb_dim = spk_emb_dim self.n_feats = n_feats self.out_size = out_size + self.prior_loss = prior_loss if n_spks > 1: self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) @@ -228,7 +230,10 @@ def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=Non # Compute 
loss of the decoder diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) - prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) - prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + if self.prior_loss: + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + else: + prior_loss = 0 return dur_loss, prior_loss, diff_loss From 263d5c4d4ea23da432e6a6b5e5e19f1da8e00a45 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Fri, 1 Dec 2023 12:06:26 +0000 Subject: [PATCH 02/10] Adding piper phonemizer with different dataset --- configs/data/hi-fi_en-US_female.yaml | 13 +++++++++++++ .../experiment/hifi_dataset_piper_phonemizer.yaml | 14 ++++++++++++++ matcha/text/cleaners.py | 11 +++++++++++ requirements.txt | 1 + 4 files changed, 39 insertions(+) create mode 100644 configs/data/hi-fi_en-US_female.yaml create mode 100644 configs/experiment/hifi_dataset_piper_phonemizer.yaml diff --git a/configs/data/hi-fi_en-US_female.yaml b/configs/data/hi-fi_en-US_female.yaml new file mode 100644 index 0000000..2a95cda --- /dev/null +++ b/configs/data/hi-fi_en-US_female.yaml @@ -0,0 +1,13 @@ +defaults: + - ljspeech + - _self_ + +_target_: matcha.data.text_mel_datamodule.TextMelDataModule +name: hi-fi_en-US_female +train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt +valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt +batch_size: 32 +cleaners: [english_cleaners_piper] +data_statistics: # Computed for vctk dataset + mel_mean: -6.38385 + mel_std: 2.541796 diff --git a/configs/experiment/hifi_dataset_piper_phonemizer.yaml b/configs/experiment/hifi_dataset_piper_phonemizer.yaml new file mode 100644 index 0000000..7e6c57a --- /dev/null +++ b/configs/experiment/hifi_dataset_piper_phonemizer.yaml @@ -0,0 +1,14 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py 
experiment=multispeaker + +defaults: + - override /data: hi-fi_en-US_female.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] + +run_name: hi-fi_en-US_female_piper_phonemizer diff --git a/matcha/text/cleaners.py b/matcha/text/cleaners.py index 26b91d7..5e8d96b 100644 --- a/matcha/text/cleaners.py +++ b/matcha/text/cleaners.py @@ -15,6 +15,7 @@ import re import phonemizer +import piper_phonemize from unidecode import unidecode # To avoid excessive logging we set the log level of the phonemizer package to Critical @@ -103,3 +104,13 @@ def english_cleaners2(text): phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] phonemes = collapse_whitespace(phonemes) return phonemes + + +def english_cleaners_piper(text): + """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) + phonemes = collapse_whitespace(phonemes) + return phonemes diff --git a/requirements.txt b/requirements.txt index c1be781..f657dc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,3 +42,4 @@ gradio gdown wget seaborn +piper_phonemize From a18db173302052cd0bfff1fe8f70ef7b58ae87be Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:12:39 +0000 Subject: [PATCH 03/10] Removing the option for configuring prior loss, the durations predicted are not so good then --- configs/model/matcha.yaml | 1 - matcha/models/matcha_tts.py | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/configs/model/matcha.yaml b/configs/model/matcha.yaml index 36f6eaf..4700855 100644 --- a/configs/model/matcha.yaml +++ b/configs/model/matcha.yaml @@ -12,4 +12,3 @@ spk_emb_dim: 64 
n_feats: 80 data_statistics: ${data.data_statistics} out_size: null # Must be divisible by 4 -prior_loss: true diff --git a/matcha/models/matcha_tts.py b/matcha/models/matcha_tts.py index 64b2c07..6feb9e7 100644 --- a/matcha/models/matcha_tts.py +++ b/matcha/models/matcha_tts.py @@ -34,7 +34,6 @@ def __init__( out_size, optimizer=None, scheduler=None, - prior_loss=True, ): super().__init__() @@ -45,7 +44,6 @@ def __init__( self.spk_emb_dim = spk_emb_dim self.n_feats = n_feats self.out_size = out_size - self.prior_loss = prior_loss if n_spks > 1: self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) @@ -230,10 +228,7 @@ def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=Non # Compute loss of the decoder diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) - if self.prior_loss: - prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) - prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) - else: - prior_loss = 0 + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) return dur_loss, prior_loss, diff_loss From 009b09a8b2ff5922e076dd4892be8a3ce5b95e3e Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:13:44 +0000 Subject: [PATCH 04/10] Removing unwanted configs --- configs/experiment/ljspeech_no_prior_loss.yaml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 configs/experiment/ljspeech_no_prior_loss.yaml diff --git a/configs/experiment/ljspeech_no_prior_loss.yaml b/configs/experiment/ljspeech_no_prior_loss.yaml deleted file mode 100644 index 6181950..0000000 --- a/configs/experiment/ljspeech_no_prior_loss.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: ljspeech.yaml - -# all parameters below will be merged with 
parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech - -model: - prior_loss: false From 6c7a82a51651370b562eb9f750b7f9a087cac293 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:15:13 +0000 Subject: [PATCH 05/10] Adding dataset information --- configs/data/hi-fi_en-US_female.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/data/hi-fi_en-US_female.yaml b/configs/data/hi-fi_en-US_female.yaml index 2a95cda..1269f9b 100644 --- a/configs/data/hi-fi_en-US_female.yaml +++ b/configs/data/hi-fi_en-US_female.yaml @@ -2,12 +2,13 @@ defaults: - ljspeech - _self_ +# Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ _target_: matcha.data.text_mel_datamodule.TextMelDataModule name: hi-fi_en-US_female train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt batch_size: 32 cleaners: [english_cleaners_piper] -data_statistics: # Computed for vctk dataset +data_statistics: # Computed for this dataset mel_mean: -6.38385 mel_std: 2.541796 From 6e71dc8b8fb21bc5b4d12019a88ef81c2c387422 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Tue, 5 Dec 2023 09:57:37 +0000 Subject: [PATCH 06/10] adding prior loss as a configuration --- configs/model/matcha.yaml | 1 + matcha/models/matcha_tts.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/configs/model/matcha.yaml b/configs/model/matcha.yaml index 4700855..36f6eaf 100644 --- a/configs/model/matcha.yaml +++ b/configs/model/matcha.yaml @@ -12,3 +12,4 @@ spk_emb_dim: 64 n_feats: 80 data_statistics: ${data.data_statistics} out_size: null # Must be divisible by 4 +prior_loss: true diff --git a/matcha/models/matcha_tts.py b/matcha/models/matcha_tts.py index 6feb9e7..64b2c07 100644 --- a/matcha/models/matcha_tts.py +++ b/matcha/models/matcha_tts.py @@ -34,6 +34,7 @@ def 
__init__( out_size, optimizer=None, scheduler=None, + prior_loss=True, ): super().__init__() @@ -44,6 +45,7 @@ def __init__( self.spk_emb_dim = spk_emb_dim self.n_feats = n_feats self.out_size = out_size + self.prior_loss = prior_loss if n_spks > 1: self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) @@ -228,7 +230,10 @@ def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=Non # Compute loss of the decoder diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) - prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) - prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + if self.prior_loss: + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + else: + prior_loss = 0 return dur_loss, prior_loss, diff_loss From f39ee6cf3be349ba4b239c065b93e1e6398096f6 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Tue, 5 Dec 2023 12:10:52 +0000 Subject: [PATCH 07/10] Changing while to for for more readability --- matcha/models/components/flow_matching.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/matcha/models/components/flow_matching.py b/matcha/models/components/flow_matching.py index 4d77547..5cad743 100644 --- a/matcha/models/components/flow_matching.py +++ b/matcha/models/components/flow_matching.py @@ -73,16 +73,14 @@ def solve_euler(self, x, t_span, mu, mask, spks, cond): # Or in future might add like a return_all_steps flag sol = [] - steps = 1 - while steps <= len(t_span) - 1: + for step in range(1, len(t_span)): dphi_dt = self.estimator(x, mask, mu, t, spks, cond) x = x + dt * dphi_dt t = t + dt sol.append(x) - if steps < len(t_span) - 1: - dt = t_span[steps + 1] - t - steps += 1 + if step < len(t_span) - 1: + dt = t_span[step + 1] - t return sol[-1] From 0ed9290c312d125ff73876056703df32a48c748a Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Wed, 6
Dec 2023 10:39:54 +0000 Subject: [PATCH 08/10] Logging global step while training --- matcha/models/baselightningmodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcha/models/baselightningmodule.py b/matcha/models/baselightningmodule.py index 29f4927..3724888 100644 --- a/matcha/models/baselightningmodule.py +++ b/matcha/models/baselightningmodule.py @@ -81,7 +81,7 @@ def training_step(self, batch: Any, batch_idx: int): "step", float(self.global_step), on_step=True, - on_epoch=True, + prog_bar=True, logger=True, sync_dist=True, ) From 254a8e05ce140502d7cbab34695f9ea9b27ea6dc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Dec 2023 13:20:11 +0000 Subject: [PATCH 09/10] Bump diffusers from 0.21.3 to 0.25.0 Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.21.3 to 0.25.0. - [Release notes](https://github.com/huggingface/diffusers/releases) - [Commits](https://github.com/huggingface/diffusers/compare/v0.21.3...v0.25.0) --- updated-dependencies: - dependency-name: diffusers dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f657dc1..0a7e14c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,7 +35,7 @@ torchaudio matplotlib pandas conformer==0.3.2 -diffusers==0.21.3 +diffusers==0.25.0 notebook ipywidgets gradio From 95ec24b5992d9e4b23cbcc62c846042c3a37542c Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Fri, 12 Jan 2024 10:48:52 +0000 Subject: [PATCH 10/10] Version bump --- matcha/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcha/VERSION b/matcha/VERSION index 81340c7..bbdeab6 100644 --- a/matcha/VERSION +++ b/matcha/VERSION @@ -1 +1 @@ -0.0.4 +0.0.5