forked from coqui-ai/TTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4aaf406
commit 64cff14
Showing
2 changed files
with
340 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
{ | ||
"github_branch":"* main", | ||
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-11-2021_02+26PM-0ee3eee/checkpoint_500000.pth.tar", | ||
"github_branch":"* main", | ||
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-06-2021_10+34AM-0ee3eee/checkpoint_400000.pth.tar", | ||
"github_branch":"* main", | ||
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-May-01-2021_06+07AM-0ee3eee/checkpoint_300000.pth.tar", | ||
"github_branch":"* main", | ||
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-28-2021_10+55PM-0ee3eee/checkpoint_200000.pth.tar", | ||
"github_branch":"* main", | ||
"restore_path":"/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output/thorsten-dca-fullband-melgan-main-branch-April-26-2021_03+42PM-0ee3eee/checkpoint_100000.pth.tar", | ||
"github_branch":"* main", | ||
"github_branch":"* main", | ||
"run_name": "thorsten-dca-fullband-melgan-main-branch", | ||
"run_description": "FullBand-MelGAN Vocoder on thorsten v03 vocoder dataset. Commit id 0ee3eeefb553678d56c49534f3972a426a254649", | ||
|
||
// AUDIO PARAMETERS | ||
"audio":{ | ||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame. | ||
"win_length": 1024, // stft window length in samples. | ||
"hop_length": 256, // stft window hop-length in samples. | ||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used. | ||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used. | ||
|
||
// Audio processing parameters | ||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. | ||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis. | ||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. | ||
|
||
// Silence trimming | ||
"do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) | ||
"trim_db": 50, // threshold for trimming silence. Set this according to your dataset. | ||
|
||
// MelSpectrogram parameters | ||
"num_mels": 80, // size of the mel spec frame. | ||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! | ||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! | ||
"spec_gain": 1.0, // scalar value applied after log transform of spectrogram. | ||
|
||
// Normalization parameters | ||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. | ||
"min_level_db": -100, // lower bound for normalization | ||
"symmetric_norm": true, // move normalization to range [-1, 1] | ||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] | ||
"clip_norm": true, // clip normalized values into the range. | ||
"stats_path": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/spec-stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored | ||
}, | ||
|
||
// DISTRIBUTED TRAINING | ||
// "distributed":{ | ||
// "backend": "nccl", | ||
// "url": "tcp:\/\/localhost:54321" | ||
// }, | ||
|
||
// MODEL PARAMETERS | ||
"use_pqmf": false, | ||
|
||
// LOSS PARAMETERS | ||
"use_stft_loss": true, | ||
"use_subband_stft_loss": false, | ||
"use_mse_gan_loss": true, | ||
"use_hinge_gan_loss": false, | ||
"use_feat_match_loss": true, // use only with melgan discriminators | ||
"use_l1_spec_loss": true, | ||
|
||
// loss weights | ||
"stft_loss_weight": 0, | ||
"subband_stft_loss_weight": 0, | ||
"mse_G_loss_weight": 1, | ||
"hinge_G_loss_weight": 0, | ||
"feat_match_loss_weight": 108, | ||
"l1_spec_loss_weight": 45, | ||
|
||
// multiscale stft loss parameters | ||
"stft_loss_params": { | ||
"n_ffts": [1024, 2048, 512], | ||
"hop_lengths": [120, 240, 50], | ||
"win_lengths": [600, 1200, 240] | ||
}, | ||
|
||
"l1_spec_loss_params": { | ||
"use_mel": true, | ||
"sample_rate": 22050, | ||
"n_fft": 1024, | ||
"hop_length": 256, | ||
"win_length": 1024, | ||
"n_mels": 80, | ||
"mel_fmin": 50.0, | ||
"mel_fmax": 8000.0 | ||
}, | ||
|
||
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch | ||
|
||
// DISCRIMINATOR | ||
"discriminator_model": "melgan_multiscale_discriminator", | ||
"discriminator_model_params":{ | ||
"base_channels": 16, | ||
"max_channels":512, | ||
"downsample_factors":[4, 4, 4] | ||
}, | ||
"steps_to_start_discriminator": 200000, // steps required to start GAN training. | ||
"diff_samples_for_G_and_D": true, | ||
|
||
// GENERATOR | ||
"generator_model": "fullband_melgan_generator", | ||
"generator_model_params": { | ||
"upsample_factors":[8, 8, 4], | ||
"num_res_blocks": 4 | ||
}, | ||
|
||
// DATASET | ||
"data_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/wavs/", | ||
"feature_path": "/home/thorsten/___prj/tts/datasets/thorsten-de_v03/features/", | ||
"seq_len": 16384, | ||
"pad_short": 2000, | ||
"conv_pad": 0, | ||
"use_noise_augment": false, | ||
"use_cache": true, | ||
|
||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. | ||
|
||
// TRAINING | ||
"batch_size": 48, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. | ||
|
||
// VALIDATION | ||
"run_eval": true, | ||
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. | ||
"test_sentences_file": "/home/thorsten/___prj/tts/models/taco2/thorsten-dca/test_sentences.txt", // set a file to load sentences to be used for testing. If it is null then we use default english sentences. | ||
|
||
// OPTIMIZER | ||
"epochs": 10000, // total number of epochs to train. | ||
"wd": 0.0, // Weight decay weight. | ||
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 | ||
"disc_clip_grad": -1, // Discriminator gradient clipping threshold. | ||
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate | ||
"lr_scheduler_gen_params": { | ||
"gamma": 0.5, | ||
"milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000] | ||
}, | ||
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate | ||
"lr_scheduler_disc_params": { | ||
"gamma": 0.5, | ||
"milestones": [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000] | ||
}, | ||
"lr_gen": 0.000003125, // Initial learning rate. If Noam decay is active, maximum learning rate. | ||
"lr_disc": 0.000003125, | ||
"optimizer": "AdamW", | ||
"optimizer_params":{ | ||
"betas": [0.8, 0.99], | ||
"weight_decay": 0.0 | ||
}, | ||
|
||
// TENSORBOARD and LOGGING | ||
"print_step": 25, // Number of steps to log training on console. | ||
"print_eval": false, // If True, it prints loss values for each step in eval run. | ||
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. | ||
"checkpoint": true, // If true, it saves checkpoints per "save_step" | ||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. | ||
|
||
// DATA LOADING | ||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. | ||
"num_val_loader_workers": 4, // number of evaluation data loader processes. | ||
"eval_split_size": 10, | ||
|
||
// PATHS | ||
"output_path": "/home/thorsten/___prj/tts/models/vocoder/fullband-melgan-main/output" | ||
} | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
from TTS.vocoder.utils.generic_utils import setup_generator | ||
import os | ||
import torch | ||
import time | ||
import pandas as pd | ||
from pathlib import Path | ||
from os.path import join | ||
import datetime | ||
|
||
from TTS.tts.utils.generic_utils import setup_model | ||
from TTS.utils.io import load_config | ||
from TTS.tts.utils.text.symbols import symbols, phonemes | ||
from TTS.utils.audio import AudioProcessor | ||
from TTS.tts.utils.synthesis import synthesis | ||
|
||
def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=None, style_wav=None):
    """Synthesize ``text`` with ``model`` and print timing statistics.

    Args:
        model: TTS model forwarded to ``synthesis``.
        text: Sentence to synthesize.
        CONFIG: Loaded TTS config; ``CONFIG.enable_eos_bos_chars`` is read here.
        use_cuda: Forwarded to ``synthesis``.
        ap: Audio processor; supplies ``denormalize`` and ``sample_rate``.
        use_gl: Forwarded to ``synthesis`` as ``use_griffin_lim``. When false,
            the waveform is produced by the module-level ``vocoder_model``
            from the mel spectrogram instead.
        figures: Unused; kept so existing call sites keep working.
        reference_info: Optional ``(reference_wav, reference_text)`` pair
            forwarded to ``synthesis``.
        style_wav: Forwarded to ``synthesis``.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``mel_postnet_spec`` has been transposed and passed through
        ``ap.denormalize``.

    Note:
        ``speaker_id`` and ``vocoder_model`` are read from module scope, not
        passed in. ``vocoder_model`` must be defined by the caller's module
        before calling with ``use_gl=False``; otherwise a ``NameError`` is
        raised.
    """
    t_1 = time.time()
    if reference_info is None:
        reference_wav, reference_text = None, None
    else:
        reference_wav, reference_text = reference_info[0], reference_info[1]

    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, _ = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id,
        style_wav=style_wav,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        use_griffin_lim=use_gl,
        reference_wav=reference_wav,
        reference_text=reference_text,
    )
    mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)

    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_spec.T).unsqueeze(0))
        waveform = waveform.flatten()

    # Take one timestamp so run-time, RTF and time-per-step are consistent.
    elapsed = time.time() - t_1
    num_samples = len(waveform)
    if num_samples:
        rtf = elapsed / (num_samples / ap.sample_rate)
        tps = elapsed / num_samples
    else:
        # An empty waveform has no duration; avoid dividing by zero.
        rtf = tps = float("inf")

    print(waveform.shape)
    print(" > Run-time: {}".format(elapsed))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return alignment, mel_postnet_spec, stop_tokens, waveform
|
||
|
||
''' Runtime settings '''
use_cuda = False

''' Directory Mgmt '''
# Every run writes into its own timestamped folder below <run>/TESTING.
now = datetime.datetime.now()
RUN_NAME = '300_128'
TEST_PATH = Path(r'/Users/adamfroghyar/Models/Blizzard/') / RUN_NAME / 'TESTING'
CURRENT_TEST_PATH = TEST_PATH / now.strftime("%Y-%m-%d %H:%M:%S")
TEST_PATH.mkdir(parents=True, exist_ok=True)
CURRENT_TEST_PATH.mkdir(parents=True, exist_ok=True)

# Checkpoint and config locations for the TTS model and the vocoder.
TTS_MODEL = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'best_model.pth.tar')
TTS_CONFIG = join(r'/Users/adamfroghyar/Models/Blizzard', RUN_NAME, 'config.json')
VOCODER_MODEL = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/best_model.pth.tar"
VOCODER_CONFIG = "/Users/adamfroghyar/Models/BlizzardVocoder/WaveGrad/config.json"

# Replace the TTS config path with the loaded config object.
# The vocoder config stays a plain path because the vocoder is disabled.
TTS_CONFIG = load_config(TTS_CONFIG)
# VOCODER_CONFIG = load_config(VOCODER_CONFIG)

# Audio processor built from the TTS config's audio section.
# TTS_CONFIG.audio['stats_path'] = join(r'/home/big-boy/Models/Blizzard', 'blizzard-gts-March-17-2021_03+34PM-b4248b0', 'scale_stats.npy')
ap = AudioProcessor(**TTS_CONFIG.audio)
|
||
''' LOAD TTS MODEL '''

# Single-speaker setup: no speaker id and an empty speaker list.
speaker_id = None
speakers = []

# The input vocabulary is the phoneme set or the symbol set, per the config.
num_chars = len(phonemes if TTS_CONFIG.use_phonemes else symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# Read the checkpoint onto the CPU and restore the model weights from it.
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# Apply the decoder step size stored in the checkpoint, when present.
if 'r' in cp:
    model.decoder.set_r(cp['r'])
|
||
''' VOCODER ''' | ||
# LOAD VOCODER MODEL | ||
# vocoder_model = setup_generator(VOCODER_CONFIG) | ||
# vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"]) | ||
# vocoder_model.remove_weight_norm() | ||
# vocoder_model.inference_padding = 0 | ||
|
||
# ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) | ||
# if use_cuda: | ||
# vocoder_model.cuda() | ||
# vocoder_model.eval() | ||
|
||
# Selection switches read by the inference loop.
SAMPLE_FROM = 'posterior'  # 'prior' or 'posterior'
TEXT = 'single_sentence'  # 'same_text' or 'sentences' or 'single_sentence'
TXT_DEPENDENCY = True

# Text used for every reference when TEXT == 'single_sentence'.
single_sentence = "Reality is the sum or aggregate of all that is real or existent within a system, as opposed to that which is only imaginary."

# Texts indexed by reference row when TEXT == 'sentences'.
sentences = [
    "Sixty-Four comes asking for bread.",
    "Two seats were vacant.",
    "Let me help you with your baggage.",
    "The beauty of the sunset was obscured by the industrial cranes.",
    "He embraced his new life as an eggplant.",
    "Cursive writing is the best way to build a race track.",
    "They got there early, and they got really good seats.",
    "Your girlfriend bought your favorite cookie crisp cereal but forgot to get milk.",
    "A suit of armor provides excellent sun protection on hot days.",
    "She couldn't decide of the glass was half empty or half full so she drank it.",
    "Never underestimate the willingness of the greedy to throw you under the bus.",
    "She had a habit of taking showers in lemonade.",
]
|
||
''' Run Inference '''
# Pipe-separated reference metadata without a header row: <ID>|<Text>.
reference_df = pd.read_csv(Path('/Users/adamfroghyar/Data/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|', delimiter=None)
# # reference_df = pd.read_csv(Path('/home/big-boy/Data/LJSpeech-1.1/refs_metadata.csv'), header=None, names=['ID', 'Text'], sep='|', delimiter=None)

for i, ref_row in reference_df.iterrows():
    _id = ref_row['ID']
    reference_txt = ref_row['Text']

    # Choose the text to synthesize before reference_txt may be cleared below.
    if TEXT == 'single_sentence':
        sentence = single_sentence
    elif TEXT == 'sentences':
        sentence = sentences[i]
    else:
        sentence = reference_txt

    reference_path = '/Users/adamfroghyar/Data/refs/seen/{}.wav'.format(_id)
    if not TXT_DEPENDENCY:
        reference_txt = None

    # Reference audio and text are passed only when sampling from the posterior.
    if SAMPLE_FROM == 'posterior':
        refs = [reference_path, reference_txt]
    else:
        refs = None

    align, spec, stop_tokens, wav = tts(
        model,
        sentence,
        TTS_CONFIG,
        use_cuda,
        ap,
        use_gl=True,
        figures=True,
        reference_info=refs,
        style_wav=reference_path,
    )

    file_handle = 'Prior' if SAMPLE_FROM == 'prior' else 'Posterior'
    # `and` binds tighter than `or`; the parentheses spell out the original grouping.
    if TEXT == 'single_sentence' or (TEXT == 'same_text' and SAMPLE_FROM != 'prior'):
        file_id = _id
    else:
        file_id = i

    ap.save_wav(wav, join(CURRENT_TEST_PATH, 'GMM_{}_{}.wav'.format(file_handle, file_id)))
|