mac eval and melgan config

a-froghyar · May 23, 2021 · 64cff14 · 64cff14
1 parent 4aaf406
commit 64cff14
Show file tree

Hide file tree

Showing 2 changed files with 340 additions and 0 deletions.
diff --git a/TTS/vocoder/configs/thorsten_melgan_config.json b/TTS/vocoder/configs/thorsten_melgan_config.json
@@ -0,0 +1,170 @@
+{
+    "github_branch":"* main",
+    "restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-11-2021_02+26PM-0ee3eee/checkpoint_500000.pth.tar",
+    "github_branch":"* main",
+    "restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-06-2021_10+34AM-0ee3eee/checkpoint_400000.pth.tar",
+    "github_branch":"* main",
+    "restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-01-2021_06+07AM-0ee3eee/checkpoint_300000.pth.tar",
+    "github_branch":"* main",
+    "restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-28-2021_10+55PM-0ee3eee/checkpoint_200000.pth.tar",
+    "github_branch":"* main",
+    "restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-26-2021_03+42PM-0ee3eee/checkpoint_100000.pth.tar",
+    "github_branch":"* main",
+    "github_branch":"* main",
+        "run_name": "thorsten-dca-fullband-melgan-main-branch",
+        "run_description": "FullBand-MelGAN Vocoder on thorsten v03 vocoder dataset. Commit id 0ee3eeefb553678d56c49534f3972a426a254649",
+
+        // AUDIO PARAMETERS
+        "audio":{
+            "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+            "win_length": 1024,      // stft window length in samples.
+            "hop_length": 256,       // stft window hop-length in samples.
+            "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+            "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+            // Audio processing parameters
+            "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+            "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+            "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
+
+            // Silence trimming
+            "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+            "trim_db": 50,          // threshold for trimming silence. Set this according to your dataset.
+
+            // MelSpectrogram parameters
+            "num_mels": 80,         // size of the mel spec frame.
+            "mel_fmin": 50.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+            "mel_fmax": 8000.0,     // maximum freq level for mel-spec. Tune for dataset!!
+            "spec_gain": 1.0,         // scaler value applied after log transform of spectrogram.
+
+            // Normalization parameters
+            "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+            "min_level_db": -100,   // lower bound for normalization
+            "symmetric_norm": true, // move normalization to range [-1, 1]
+            "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+            "clip_norm": true,      // clip normalized values into the range.
+            "stats_path": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/spec-stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+        },
+
+        // DISTRIBUTED TRAINING
+        // "distributed":{
+        //     "backend": "nccl",
+        //     "url": "tcp:\/\/localhost:54321"
+        // },
+
+        // MODEL PARAMETERS
+        "use_pqmf": false,
+
+        // LOSS PARAMETERS
+        "use_stft_loss": true,
+        "use_subband_stft_loss": false,
+        "use_mse_gan_loss": true,
+        "use_hinge_gan_loss": false,
+        "use_feat_match_loss": true,  // use only with melgan discriminators
+        "use_l1_spec_loss": true,
+
+        // loss weights
+        "stft_loss_weight": 0,
+        "subband_stft_loss_weight": 0,
+        "mse_G_loss_weight": 1,
+        "hinge_G_loss_weight": 0,
+        "feat_match_loss_weight": 108,
+        "l1_spec_loss_weight": 45,
+
+        // multiscale stft loss parameters
+        "stft_loss_params": {
+            "n_ffts": [1024, 2048, 512],
+            "hop_lengths": [120, 240, 50],
+            "win_lengths": [600, 1200, 240]
+        },
+
+        "l1_spec_loss_params": {
+            "use_mel": true,
+            "sample_rate": 22050,
+            "n_fft": 1024,
+            "hop_length": 256,
+            "win_length": 1024,
+            "n_mels": 80,
+            "mel_fmin": 50.0,
+            "mel_fmax": 8000.0
+        },
+
+        "target_loss": "avg_G_loss",  // loss value to pick the best model to save after each epoch
+
+        // DISCRIMINATOR
+        "discriminator_model": "melgan_multiscale_discriminator",
+        "discriminator_model_params":{
+            "base_channels": 16,
+            "max_channels":512,
+            "downsample_factors":[4, 4, 4]
+        },
+        "steps_to_start_discriminator": 200000,      // steps required to start GAN training.
+        "diff_samples_for_G_and_D": true,
+
+        // GENERATOR
+        "generator_model": "fullband_melgan_generator",
+        "generator_model_params": {
+            "upsample_factors":[8, 8, 4],
+            "num_res_blocks": 4
+        },
+
+        // DATASET
+        "data_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/wavs/",
+        "feature_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/features/",
+        "seq_len": 16384,
+        "pad_short": 2000,
+        "conv_pad": 0,
+        "use_noise_augment": false,
+        "use_cache": true,
+
+        "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+        // TRAINING
+        "batch_size": 48,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+
+        // VALIDATION
+        "run_eval": true,
+        "test_delay_epochs": 10,  //Until attention is aligned, testing only wastes computation time.
+        "test_sentences_file": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/test_sentences.txt",  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+        // OPTIMIZER
+        "epochs": 10000,                // total number of epochs to train.
+        "wd": 0.0,                // Weight decay weight.
+        "gen_clip_grad": -1,      // Generator gradient clipping threshold. Apply gradient clipping if > 0
+        "disc_clip_grad": -1,     // Discriminator gradient clipping threshold.
+        "lr_scheduler_gen": "MultiStepLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+        "lr_scheduler_gen_params": {
+            "gamma": 0.5,
+            "milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000]
+        },
+        "lr_scheduler_disc": "MultiStepLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+        "lr_scheduler_disc_params": {
+               "gamma": 0.5,
+              "milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000]
+        },
+        "lr_gen": 0.000003125,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
+        "lr_disc": 0.000003125,
+        "optimizer": "AdamW",
+        "optimizer_params":{
+            "betas": [0.8, 0.99],
+            "weight_decay": 0.0
+        },
+
+        // TENSORBOARD and LOGGING
+        "print_step": 25,       // Number of steps to log training on console.
+        "print_eval": false,     // If True, it prints loss values for each step in eval run.
+        "save_step": 25000,      // Number of training steps expected to plot training stats on TB and save model checkpoints.
+        "checkpoint": true,     // If true, it saves checkpoints per "save_step"
+        "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+        // DATA LOADING
+        "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+        "num_val_loader_workers": 4,    // number of evaluation data loader processes.
+        "eval_split_size": 10,
+
+        // PATHS
+        "output_path": "/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output"
+    }
+
+
+
diff --git a/eval_mac.py b/eval_mac.py
@@ -0,0 +1,170 @@
+from TTS.vocoder.utils.generic_utils import setup_generator
+import os
+import torch
+import time
+import pandas as pd
+from pathlib import Path
+from os.path import join
+import datetime
+
+from TTS.tts.utils.generic_utils import setup_model
+from TTS.utils.io import load_config
+from TTS.tts.utils.text.symbols import symbols, phonemes
+from TTS.utils.audio import AudioProcessor
+from TTS.tts.utils.synthesis import synthesis
+
+def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=None, style_wav=None):
+    t_1 = time.time()
+    reference_wav = reference_info[0] if reference_info is not None else None
+    reference_text = reference_info[1] if reference_info is not None else None
+    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
+        model,
+        text,
+        CONFIG,
+        use_cuda,
+        ap,
+        speaker_id,
+        style_wav=style_wav,
+        truncated=False,
+        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
+        use_griffin_lim=use_gl,
+        reference_wav=reference_wav,
+        reference_text=reference_text
+    )
+    mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)
+    if not use_gl:
+        waveform = vocoder_model.inference(torch.FloatTensor(mel_spec.T).unsqueeze(0))
+        waveform = waveform.flatten()
+    if use_cuda:
+        waveform = waveform
+    # waveform = waveform.numpy()
+    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
+    tps = (time.time() - t_1) / len(waveform)
+    print(waveform.shape)
+    print(" > Run-time: {}".format(time.time() - t_1))
+    print(" > Real-time factor: {}".format(rtf))
+    print(" > Time per step: {}".format(tps))
+    return alignment, mel_postnet_spec, stop_tokens, waveform
+
+
+''' Runtime settings '''
+use_cuda = False
+
+''' Directory Mgmt '''
+
+now = datetime.datetime.now()
+
+RUN_NAME = '300_128'
+TEST_PATH = Path(join(r'/Users/adamfroghyar/Models/Blizzard/', RUN_NAME, 'TESTING'))
+CURRENT_TEST_PATH = Path(join(TEST_PATH, now.strftime("%Y-%m-%d %H:%M:%S")))
+TEST_PATH.mkdir(parents=True, exist_ok=True)
+
+CURRENT_TEST_PATH.mkdir(parents=True, exist_ok=True)
+
+# model paths
+TTS_MODEL = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'best_model.pth.tar')
+TTS_CONFIG = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'config.json')
+VOCODER_MODEL = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/best_model.pth.tar"
+VOCODER_CONFIG = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/config.json"
+
+# load configs
+TTS_CONFIG = load_config(TTS_CONFIG)
+# VOCODER_CONFIG = load_config(VOCODER_CONFIG)
+
+# load the audio processor
+# TTS_CONFIG.audio['stats_path'] = join(r'/home/big-boy/Models/Blizzard', 'blizzard-gts-March-17-2021_03+34PM-b4248b0', 'scale_stats.npy')
+
+ap = AudioProcessor(**TTS_CONFIG.audio)
+
+''' LOAD TTS MODEL '''
+
+# multi speaker
+speaker_id = None
+speakers = []
+
+# load the model
+num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
+model = setup_model(num_chars, len(speakers), TTS_CONFIG)
+
+# load model state
+cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
+
+# load the model
+model.load_state_dict(cp['model'])
+if use_cuda:
+    model.cuda()
+model.eval()
+
+# set model stepsize
+if 'r' in cp:
+    model.decoder.set_r(cp['r'])
+
+''' VOCODER '''
+# LOAD VOCODER MODEL
+# vocoder_model = setup_generator(VOCODER_CONFIG)
+# vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
+# vocoder_model.remove_weight_norm()
+# vocoder_model.inference_padding = 0
+
+# ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
+# if use_cuda:
+#     vocoder_model.cuda()
+# vocoder_model.eval()
+
+# Fixed evaluation sentences. When TEXT == 'sentences' the inference loop
+# picks one by the row index of the reference metadata.
+sentences = [
+    "Sixty-Four comes asking for bread.",
+    "Two seats were vacant.",
+    "Let me help you with your baggage.",
+    "The beauty of the sunset was obscured by the industrial cranes.",
+    "He embraced his new life as an eggplant.",
+    "Cursive writing is the best way to build a race track.",
+    "They got there early, and they got really good seats.",
+    "Your girlfriend bought your favorite cookie crisp cereal but forgot to get milk.",
+    "A suit of armor provides excellent sun protection on hot days.",
+    "She couldn't decide of the glass was half empty or half full so she drank it.",
+    "Never underestimate the willingness of the greedy to throw you under the bus.",
+    "She had a habit of taking showers in lemonade."
+]
+
+# Text synthesized for every reference when TEXT == 'single_sentence'.
+single_sentence = "Reality is the sum or aggregate of all that is real or existent within a system, as opposed to that which is only imaginary."
+
+# 'posterior' passes [reference wav path, reference text] to tts();
+# any other value passes no reference info.
+SAMPLE_FROM = 'posterior' # 'prior' or 'posterior'
+# Chooses the text to synthesize: 'sentences' indexes the list above,
+# 'single_sentence' uses single_sentence, anything else uses the reference text.
+TEXT = 'single_sentence' # 'same_text' or 'sentences' or 'single_sentence'
+# When False, the reference text handed to tts() is replaced by None.
+TXT_DEPENDENCY = True
+
+''' Run Inference '''
+reference_df = pd.read_csv(Path('/Users/adamfroghyar/Data/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|', delimiter=None)
+# # reference_df = pd.read_csv(Path('/home/big-boy/Data/LJSpeech-1.1/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|', delimiter=None)
+
+for row in reference_df.iterrows():
+    i = row[0]
+    _id = row[1]['ID']
+    reference_txt = row[1]['Text']
+
+    sentence = sentences[i] if (TEXT == 'sentences') else reference_txt
+
+    if TEXT == 'single_sentence':
+        sentence = single_sentence
+
+    reference_path = '/Users/adamfroghyar/Data/refs/seen/{}.wav'.format(_id)
+    reference_txt = reference_txt if TXT_DEPENDENCY else None
+
+    refs = [reference_path, reference_txt] if SAMPLE_FROM == 'posterior' else None
+
+    align, spec, stop_tokens, wav = tts(
+        model,
+        sentence,
+        TTS_CONFIG,
+        use_cuda,
+        ap,
+        use_gl=True,
+        figures=True,
+        reference_info=refs,
+        style_wav=reference_path
+    )
+
+    file_handle = 'Prior' if (SAMPLE_FROM == 'prior') else 'Posterior'
+    file_id = _id if TEXT == 'single_sentence' or TEXT == 'same_text' and SAMPLE_FROM != 'prior' else i
+
+    ap.save_wav(wav, join(CURRENT_TEST_PATH, 'GMM_{}_{}.wav'.format(file_handle, file_id)))
+