Commit 64cff14: mac eval and melgan config

a-froghyar committed May 23, 2021 (1 parent 4aaf406)

Showing 2 changed files with 340 additions and 0 deletions.
170 changes: 170 additions & 0 deletions TTS/vocoder/configs/thorsten_melgan_config.json
@@ -0,0 +1,170 @@
{
"github_branch":"* main",
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-11-2021_02+26PM-0ee3eee/checkpoint_500000.pth.tar",
"github_branch":"* main",
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-06-2021_10+34AM-0ee3eee/checkpoint_400000.pth.tar",
"github_branch":"* main",
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-01-2021_06+07AM-0ee3eee/checkpoint_300000.pth.tar",
"github_branch":"* main",
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-28-2021_10+55PM-0ee3eee/checkpoint_200000.pth.tar",
"github_branch":"* main",
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-26-2021_03+42PM-0ee3eee/checkpoint_100000.pth.tar",
"github_branch":"* main",
"github_branch":"* main",
"run_name": "thorsten-dca-fullband-melgan-main-branch",
"run_description": "FullBand-MelGAN Vocoder on thorsten v03 vocoder dataset. Commit id 0ee3eeefb553678d56c49534f3972a426a254649",

// AUDIO PARAMETERS
"audio":{
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.

// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.

// Silence trimming
"do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 50, // threshold for timming silence. Set this according to your dataset.

// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.

// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/spec-stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
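// Normalization sketch (assumed from the parameter semantics above, not taken
// from the trainer's code): when 'stats_path' is unset, a dB spectrogram S is
// range-normalized as
//   S_norm = (S - min_level_db) / -min_level_db   // -> roughly [0, 1]
//   S_norm = 2 * max_norm * S_norm - max_norm     // -> [-max_norm, max_norm] when symmetric_norm
// and clipped to that range when 'clip_norm' is true; with 'stats_path' set,
// mean-variance statistics take over and these params are ignored.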

// DISTRIBUTED TRAINING
// "distributed":{
// "backend": "nccl",
// "url": "tcp:\/\/localhost:54321"
// },

// MODEL PARAMETERS
"use_pqmf": false,

// LOSS PARAMETERS
"use_stft_loss": true,
"use_subband_stft_loss": false,
"use_mse_gan_loss": true,
"use_hinge_gan_loss": false,
"use_feat_match_loss": true, // use only with melgan discriminators
"use_l1_spec_loss": true,

// loss weights
"stft_loss_weight": 0,
"subband_stft_loss_weight": 0,
"mse_G_loss_weight": 1,
"hinge_G_loss_weight": 0,
"feat_match_loss_weight": 108,
"l1_spec_loss_weight": 45,

// multiscale stft loss parameters
"stft_loss_params": {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240]
},

"l1_spec_loss_params": {
"use_mel": true,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 50.0,
"mel_fmax": 8000.0
},

"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch

// DISCRIMINATOR
"discriminator_model": "melgan_multiscale_discriminator",
"discriminator_model_params":{
"base_channels": 16,
"max_channels":512,
"downsample_factors":[4, 4, 4]
},
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
"diff_samples_for_G_and_D": true,

// GENERATOR
"generator_model": "fullband_melgan_generator",
"generator_model_params": {
"upsample_factors":[8, 8, 4],
"num_res_blocks": 4
},

// DATASET
"data_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/wavs/",
"feature_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/features/",
"seq_len": 16384,
"pad_short": 2000,
"conv_pad": 0,
"use_noise_augment": false,
"use_cache": true,

"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

// TRAINING
"batch_size": 48, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.

// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/test_sentences.txt", // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

// OPTIMIZER
"epochs": 10000, // total number of epochs to train.
"wd": 0.0, // Weight decay weight.
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_gen_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000]
},
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_disc_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000]
},
"lr_gen": 0.000003125, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_disc": 0.000003125,
"optimizer": "AdamW",
"optimizer_params":{
"betas": [0.8, 0.99],
"weight_decay": 0.0
},

// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

// DATA LOADING
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"eval_split_size": 10,

// PATHS
"output_path": "/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output"
}
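
For reference, the optimizer and scheduler settings above are the standard PyTorch AdamW and MultiStepLR combination, and the loss weights combine into a single generator objective. A minimal runnable sketch of how these settings interact, assuming a hypothetical stand-in generator module (the real one is built from "generator_model" by setup_generator):

import torch

# Hypothetical stand-in module; the real model is fullband_melgan_generator.
generator = torch.nn.Conv1d(80, 1, kernel_size=7, padding=3)

# Optimizer and schedule values copied from the config above.
optimizer = torch.optim.AdamW(generator.parameters(), lr=3.125e-6,
                              betas=(0.8, 0.99), weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000],
    gamma=0.5)  # milestones are training steps here, so the lr halves at each one

def total_generator_loss(stft, subband_stft, mse_gan, hinge_gan, feat_match, l1_spec):
    # Weighted sum implied by the loss-weight block above (a sketch, not the
    # trainer's exact code): feature matching (108) and L1 spectrogram (45) dominate.
    return (0 * stft + 0 * subband_stft + 1 * mse_gan + 0 * hinge_gan
            + 108 * feat_match + 45 * l1_spec)

With "stft_loss_weight" at 0, the multiscale STFT loss presumably still gets computed and logged but contributes no gradient.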



170 changes: 170 additions & 0 deletions eval_mac.py
@@ -0,0 +1,170 @@
import datetime
import os
import time
from os.path import join
from pathlib import Path

import pandas as pd
import torch

from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator

def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=None, style_wav=None):
    t_1 = time.time()
    reference_wav = reference_info[0] if reference_info is not None else None
    reference_text = reference_info[1] if reference_info is not None else None
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id,  # module-level global, set below
        style_wav=style_wav,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        use_griffin_lim=use_gl,
        reference_wav=reference_wav,
        reference_text=reference_text
    )
    mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)
    if not use_gl:
        # vocode the mel spectrogram with the (separately loaded) neural vocoder
        waveform = vocoder_model.inference(torch.FloatTensor(mel_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
    if use_cuda and not use_gl:
        waveform = waveform.cpu().numpy()
    run_time = time.time() - t_1
    rtf = run_time / (len(waveform) / ap.sample_rate)
    tps = run_time / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(run_time))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return alignment, mel_postnet_spec, stop_tokens, waveform
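
# Worked example for the metrics above (illustrative numbers, not measured):
# synthesizing 3 s of audio (66150 samples at 22050 Hz) in 1.5 s of wall time
# gives rtf = 1.5 / 3.0 = 0.5 and tps = 1.5 / 66150 (about 2.3e-5 s per sample);
# an rtf below 1.0 means faster than real time.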


''' Runtime settings '''
use_cuda = False

''' Directory Mgmt '''

now = datetime.datetime.now()

RUN_NAME = '300_128'
TEST_PATH = Path(join(r'/Users/adamfroghyar/Models/Blizzard/', RUN_NAME, 'TESTING'))
CURRENT_TEST_PATH = Path(join(TEST_PATH, now.strftime("%Y-%m-%d %H:%M:%S")))
TEST_PATH.mkdir(parents=True, exist_ok=True)

CURRENT_TEST_PATH.mkdir(parents=True, exist_ok=True)

# model paths
TTS_MODEL = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'best_model.pth.tar')
TTS_CONFIG = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'config.json')
VOCODER_MODEL = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/best_model.pth.tar"
VOCODER_CONFIG = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/config.json"

# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
# VOCODER_CONFIG = load_config(VOCODER_CONFIG)

# load the audio processor
# TTS_CONFIG.audio['stats_path'] = join(r'/home/big-boy/Models/Blizzard', 'blizzard-gts-March-17-2021_03+34PM-b4248b0', 'scale_stats.npy')

ap = AudioProcessor(**TTS_CONFIG.audio)

''' LOAD TTS MODEL '''

# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize ('r' is the decoder reduction factor: frames predicted per decoder step)
if 'r' in cp:
    model.decoder.set_r(cp['r'])

''' VOCODER '''
# LOAD VOCODER MODEL
# vocoder_model = setup_generator(VOCODER_CONFIG)
# vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
# vocoder_model.remove_weight_norm()
# vocoder_model.inference_padding = 0

# ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
# if use_cuda:
# vocoder_model.cuda()
# vocoder_model.eval()

sentences = [
"Sixty-Four comes asking for bread.",
"Two seats were vacant.",
"Let me help you with your baggage.",
"The beauty of the sunset was obscured by the industrial cranes.",
"He embraced his new life as an eggplant.",
"Cursive writing is the best way to build a race track.",
"They got there early, and they got really good seats.",
"Your girlfriend bought your favorite cookie crisp cereal but forgot to get milk.",
"A suit of armor provides excellent sun protection on hot days.",
"She couldn't decide of the glass was half empty or half full so she drank it.",
"Never underestimate the willingness of the greedy to throw you under the bus.",
"She had a habit of taking showers in lemonade."
]

single_sentence = "Reality is the sum or aggregate of all that is real or existent within a system, as opposed to that which is only imaginary."

SAMPLE_FROM = 'posterior' # 'prior' or 'posterior'
TEXT = 'single_sentence' # 'same_text' or 'sentences' or 'single_sentence'
TXT_DEPENDENCY = True

''' Run Inference '''
reference_df = pd.read_csv(Path('/Users/adamfroghyar/Data/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|')
# reference_df = pd.read_csv(Path('/home/big-boy/Data/LJSpeech-1.1/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|')

for i, row in reference_df.iterrows():
    _id = row['ID']
    reference_txt = row['Text']

    sentence = sentences[i] if (TEXT == 'sentences') else reference_txt

    if TEXT == 'single_sentence':
        sentence = single_sentence

    reference_path = '/Users/adamfroghyar/Data/refs/seen/{}.wav'.format(_id)
    reference_txt = reference_txt if TXT_DEPENDENCY else None

    refs = [reference_path, reference_txt] if SAMPLE_FROM == 'posterior' else None

    align, spec, stop_tokens, wav = tts(
        model,
        sentence,
        TTS_CONFIG,
        use_cuda,
        ap,
        use_gl=True,
        figures=True,
        reference_info=refs,
        style_wav=reference_path
    )

    file_handle = 'Prior' if (SAMPLE_FROM == 'prior') else 'Posterior'
    file_id = _id if TEXT == 'single_sentence' or (TEXT == 'same_text' and SAMPLE_FROM != 'prior') else i

    ap.save_wav(wav, join(CURRENT_TEST_PATH, 'GMM_{}_{}.wav'.format(file_handle, file_id)))
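
To synthesize with the neural vocoder instead of Griffin-Lim, the commented-out vocoder block above would be re-enabled and use_gl=False passed to tts(). A minimal sketch mirroring that commented code, assuming the WaveGrad paths defined earlier are valid:

VOCODER_CONFIG = load_config(VOCODER_CONFIG)  # the path string defined above
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0
if use_cuda:
    vocoder_model.cuda()
vocoder_model.eval()

align, spec, stop_tokens, wav = tts(
    model, single_sentence, TTS_CONFIG, use_cuda, ap,
    use_gl=False,  # route the mel spec through the vocoder instead of Griffin-Lim
    reference_info=None, style_wav=None)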
