small bug fix for inference

a-froghyar · May 19, 2021 · commit 4aaf406
1 parent 6bced9e

Showing 5 changed files with 43 additions and 33 deletions.
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
@@ -679,12 +679,12 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
             print("WARNING: You didn't provide a gst style wav, for this reason we use a zero tensor!")
             for i in range(c.gst['gst_style_tokens']):
                 style_wav[str(i)] = 0
-        if reference_wav is None and c.use_capacitron:
-            reference_text = None
-            print("No reference wav has been defined, sampling from the prior of Capacitron.")
-        else:
-            # TODO this is not working
-            print("Infering prosody transfer from reference file {}.".format(reference_wav))
+        if c.use_capacitron:
+            if reference_wav is not None:
+                print("Infering prosody transfer from reference file {}.".format(reference_wav))
+            else:
+                reference_text = None
+                print("No reference wav has been defined, sampling from the prior of Capacitron.")
         for idx, test_sentence in enumerate(test_sentences):
             try:
                 wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(

diff --git a/TTS/tts/configs/capacitron_blizzard.json b/TTS/tts/configs/capacitron_blizzard.json
@@ -1,7 +1,7 @@
 {
     "model": "Tacotron",
-    "run_name": "capacitron",
-    "run_description": "Capacitron-transpose",
+    "run_name": "capacitron-C=150-E=128",
+    "run_description": "Capacitron",
 
     // AUDIO PARAMETERS
     "audio": {
@@ -14,7 +14,7 @@
 
         // Audio processing parameters
         "sample_rate": 24000, // DATASET-RELATED: wav sample-rate.
-        "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
         "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
 
         // Silence trimming
@@ -45,7 +45,7 @@
     "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
 
     // TRAINING
-    "batch_size": 128, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
+    "batch_size": 256, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 2, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
     "gradual_training": null,
@@ -71,24 +71,24 @@
 
     // VALIDATION
     "run_eval": true,
-    "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
+    "test_delay_epochs": 100, //Until attention is aligned, testing only wastes computation time.
     "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
 
     // OPTIMIZER
     "noam_schedule": false, // use noam warmup and lr schedule.
     "grad_clip": 5.0, // upper limit for gradients for clipping.
-    "epochs": 300000, // total number of epochs to train.
+    "epochs": 299, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
     "wd": 0.000001, // Weight decay weight.
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
     "use_gradual_lr": true, // Hardcoded step-wise learning rate scheduling. Overrides noam schedule if noam is true
     "gradual_learning_rates": [
         [0, 1e-3],
-        [5e4, 5e-4], 
-        [1e5, 3e-4],
-        [15e4, 1e-4], 
-        [2e5, 5e-5]
+        [2e4, 5e-4], 
+        [4e5, 3e-4],
+        [6e4, 1e-4], 
+        [8e4, 5e-5]
     ],
 
     // TACOTRON PRENET
@@ -117,18 +117,19 @@
     "print_step": 100, // Number of steps to log training on console.
     "tb_plot_step": 100, // Number of steps to plot TB training figures.
     "print_eval": false, // If True, it prints intermediate loss values in evaluation.
-    "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
+    // TODO increase this to 10K when the model is stable to save training time
+    "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
 
     // DATA LOADING
     "text_cleaner": "phoneme_cleaners",
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
-    "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_loader_workers": 12, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 8, // number of evaluation data loader processes.
     "batch_group_size": 4, //Number of batches to shuffle after bucketing.
     "min_seq_len": 1, // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 153, // DATASET-RELATED: maximum text length
+    "max_seq_len": 110, // DATASET-RELATED: maximum text length
     "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
     "use_noise_augment": true,
     // "add_silence_end_seconds": 0.25, //Nico - how much silence we add to the end of a sentence ->> sounds better when concatenating multiple synthesised utterances
@@ -163,7 +164,7 @@
         "capacitron_VAE_embedding_dim": 128, // Used for the output of the VAE encoder's LSTM size and for the posterior/prior distributions
         "capacitron_use_text_summary_embeddings": true,
         "capacitron_text_summary_embedding_dim": 128, // size for the LSTM accounting for the text summary conditional input
-        "capacitron_capacity": 300, // capacity target
+        "capacitron_capacity": 150, // capacity target
         "capacitron_use_speaker_embedding": false,
         "capacitron_reference_wav": "/home/big-boy/Data/blizzard2013/segmented/refs/seen/BLZ76427.wav", // path to reference wav file to be used in capacitron inference.
         "capacitron_reference_text": "What does the letter mean, mother?", // string of what the file above is saying

diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
@@ -307,7 +307,7 @@ def inference(self, characters, speaker_ids=None, style_mel=None, reference_mel=
         if reference_text is not None:
             reference_text_embedding = self.embedding(reference_text)
             reference_text_length = torch.tensor([reference_text_embedding.size(1)], dtype=torch.int64) # pylint: disable=not-callable
-        reference_mel_length = torch.tensor([reference_mel.size(2)], dtype=torch.int64) if reference_mel is not None else None # pylint: disable=not-callable
+        reference_mel_length = torch.tensor([reference_mel.size(1)], dtype=torch.int64) if reference_mel is not None else None # pylint: disable=not-callable
 
         encoder_outputs = self.encoder(inputs)
         if self.gst:

diff --git a/eval.py b/eval.py
@@ -55,7 +55,7 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=
 
 now = datetime.datetime.now()
 
-RUN_NAME = 'capacitron-back-toEncoder-w-track_running_stats-May-12-2021_07+16AM-2840cb5'
+RUN_NAME = 'capacitron-noPreemphasis-256-LRCompressed-May-17-2021_03+27PM-fca955c'
 TEST_PATH = Path(join(r'/home/big-boy/Models/Blizzard/', RUN_NAME, 'TESTING'))
 CURRENT_TEST_PATH = Path(join(TEST_PATH, now.strftime("%Y-%m-%d %H:%M:%S")))
 TEST_PATH.mkdir(parents=True, exist_ok=True)
@@ -128,7 +128,7 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=
     "She had a habit of taking showers in lemonade."
 ]
 
-single_sentence = "When the president of Georgetown College was permitted to resign the office, he eagerly sailed to Europe."
+single_sentence = "Reality is the sum or aggregate of all that is real or existent within a system, as opposed to that which is only imaginary."
 
 SAMPLE_FROM = 'prior' # 'prior' or 'posterior'
 TEXT = 'single_sentence' # 'same_text' or 'sentences' or 'single_sentence'
@@ -166,6 +166,6 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=
     )
 
     file_handle = 'Prior' if (SAMPLE_FROM == 'prior') else 'Posterior'
-    file_id = _id if TEXT == 'single_sentence' or TEXT == 'same_text' else i
+    file_id = _id if TEXT == 'single_sentence' or TEXT == 'same_text' and SAMPLE_FROM != 'prior' else i
 
     ap.save_wav(wav, join(CURRENT_TEST_PATH, 'GMM_{}_{}.wav'.format(file_handle, file_id)))
diff --git a/playground.py b/playground.py
@@ -55,27 +55,36 @@ def extract_axis_1(data, ind):
 print(extract_axis_1(torch.tensor(matrix), to_extract))
 # %%
 
-x = torch.randn(128, 1, 151, 80)
+x = torch.randn(1, 1, 125, 4) # [batch_size, 1, time_dim, embed_dim]
 
 S = 2
 W = 1 # in channels
 Filter = 8 # out channels / filter size
 P = int(np.ceil(((S-1)*W-S+Filter)/2))
-x_pad = F.pad(x, (P//2, P//2, P//2, P//2))  # [left, right, top, bot]
+# x_pad = F.pad(x, (P//2, P//2, P//2, P//2))  # [left, right, top, bot]
+x_pad = F.pad(x, (1, 1, 1, 1))  # [left, right, top, bot]
+# print('x shape: ', x.shape)
+# print('x_pad shape: ', x_pad.shape)
+# print(x_pad[0, 0, :, 1])
 
-filters = [1] + [32, 32, 64, 64, 128, 128]
-print(x.shape)
+filters = [1] + [2, 2, 4, 4, 6, 6]
 
-valid_length = torch.tensor(x.size(3))
+valid_length = torch.tensor(100)
 for i in range(len(filters)-1):
     x = torch.nn.Conv2d(in_channels=filters[i],
                         out_channels=filters[i+1],
                         kernel_size=(3, 3),
                         stride=(2, 2),
-                        padding=(2, 2))(x)
-    valid_length = torch.ceil(valid_length/2) + 1
-    print('valid_length: ', valid_length)
-print(x.shape)
+                        padding=(1, 1))(x)
+    valid_length = torch.tensor([torch.ceil(valid_length/2)])
+    post_conv_max_width = x.size(2)
+    mask = torch.arange(post_conv_max_width).expand(1, post_conv_max_width) < valid_length.unsqueeze(1)
+    mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
+    print('unmasked: ', x[0, 0, :, :])
+    print('_____________')
+    x = x*mask
+    print('masked: ', x[0, 0, :, :])
+    print('#############')
 
 
 # padded_output_shape = torch.nn.Conv2d(in_channels=1,