Skip to content

Commit

Permalink
small bug fix for inference
Browse files Browse the repository at this point in the history
  • Loading branch information
a-froghyar committed May 19, 2021
1 parent 6bced9e commit 4aaf406
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 33 deletions.
12 changes: 6 additions & 6 deletions TTS/bin/train_tacotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,12 +679,12 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
print("WARNING: You didn't provide a gst style wav, for this reason we use a zero tensor!")
for i in range(c.gst['gst_style_tokens']):
style_wav[str(i)] = 0
if reference_wav is None and c.use_capacitron:
reference_text = None
print("No reference wav has been defined, sampling from the prior of Capacitron.")
else:
# TODO this is not working
print("Infering prosody transfer from reference file {}.".format(reference_wav))
if c.use_capacitron:
if reference_wav is not None:
print("Infering prosody transfer from reference file {}.".format(reference_wav))
else:
reference_text = None
print("No reference wav has been defined, sampling from the prior of Capacitron.")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
Expand Down
29 changes: 15 additions & 14 deletions TTS/tts/configs/capacitron_blizzard.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"model": "Tacotron",
"run_name": "capacitron",
"run_description": "Capacitron-transpose",
"run_name": "capacitron-C=150-E=128",
"run_description": "Capacitron",

// AUDIO PARAMETERS
"audio": {
Expand All @@ -14,7 +14,7 @@

// Audio processing parameters
"sample_rate": 24000, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.

// Silence trimming
Expand Down Expand Up @@ -45,7 +45,7 @@
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

// TRAINING
"batch_size": 128, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
"batch_size": 256, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
"eval_batch_size": 16,
"r": 2, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": null,
Expand All @@ -71,24 +71,24 @@

// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_delay_epochs": 100, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 5.0, // upper limit for gradients for clipping.
"epochs": 300000, // total number of epochs to train.
"epochs": 299, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
"use_gradual_lr": true, // Hardcoded step-wise learning rate scheduling. Overrides noam schedule if noam is true
"gradual_learning_rates": [
[0, 1e-3],
[5e4, 5e-4],
[1e5, 3e-4],
[15e4, 1e-4],
[2e5, 5e-5]
[2e4, 5e-4],
[4e5, 3e-4],
[6e4, 1e-4],
[8e4, 5e-5]
],

// TACOTRON PRENET
Expand Down Expand Up @@ -117,18 +117,19 @@
"print_step": 100, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evaluation.
"save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
// TODO increase this to 10K when the model is stable to save training time
"save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_loader_workers": 12, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 8, // number of evaluation data loader processes.
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
"min_seq_len": 1, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"max_seq_len": 110, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
"use_noise_augment": true,
// "add_silence_end_seconds": 0.25, //Nico - how much silence we add to the end of a sentence ->> sounds better when concatenating multiple synthesised utterances
Expand Down Expand Up @@ -163,7 +164,7 @@
"capacitron_VAE_embedding_dim": 128, // Used for the output of the VAE encoder's LSTM size and for the posterior/prior distributions
"capacitron_use_text_summary_embeddings": true,
"capacitron_text_summary_embedding_dim": 128, // size for the LSTM accounting for the text summary conditional input
"capacitron_capacity": 300, // capacity target
"capacitron_capacity": 150, // capacity target
"capacitron_use_speaker_embedding": false,
"capacitron_reference_wav": "/home/big-boy/Data/blizzard2013/segmented/refs/seen/BLZ76427.wav", // path to reference wav file to be used in capacitron inference.
"capacitron_reference_text": "What does the letter mean, mother?", // string of what the file above is saying
Expand Down
2 changes: 1 addition & 1 deletion TTS/tts/models/tacotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def inference(self, characters, speaker_ids=None, style_mel=None, reference_mel=
if reference_text is not None:
reference_text_embedding = self.embedding(reference_text)
reference_text_length = torch.tensor([reference_text_embedding.size(1)], dtype=torch.int64) # pylint: disable=not-callable
reference_mel_length = torch.tensor([reference_mel.size(2)], dtype=torch.int64) if reference_mel is not None else None # pylint: disable=not-callable
reference_mel_length = torch.tensor([reference_mel.size(1)], dtype=torch.int64) if reference_mel is not None else None # pylint: disable=not-callable

encoder_outputs = self.encoder(inputs)
if self.gst:
Expand Down
6 changes: 3 additions & 3 deletions eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=

now = datetime.datetime.now()

RUN_NAME = 'capacitron-back-toEncoder-w-track_running_stats-May-12-2021_07+16AM-2840cb5'
RUN_NAME = 'capacitron-noPreemphasis-256-LRCompressed-May-17-2021_03+27PM-fca955c'
TEST_PATH = Path(join(r'/home/big-boy/Models/Blizzard/', RUN_NAME, 'TESTING'))
CURRENT_TEST_PATH = Path(join(TEST_PATH, now.strftime("%Y-%m-%d %H:%M:%S")))
TEST_PATH.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -128,7 +128,7 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=
"She had a habit of taking showers in lemonade."
]

single_sentence = "When the president of Georgetown College was permitted to resign the office, he eagerly sailed to Europe."
single_sentence = "Reality is the sum or aggregate of all that is real or existent within a system, as opposed to that which is only imaginary."

SAMPLE_FROM = 'prior' # 'prior' or 'posterior'
TEXT = 'single_sentence' # 'same_text' or 'sentences' or 'single_sentence'
Expand Down Expand Up @@ -166,6 +166,6 @@ def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, reference_info=
)

file_handle = 'Prior' if (SAMPLE_FROM == 'prior') else 'Posterior'
file_id = _id if TEXT == 'single_sentence' or TEXT == 'same_text' else i
file_id = _id if TEXT == 'single_sentence' or TEXT == 'same_text' and SAMPLE_FROM != 'prior' else i

ap.save_wav(wav, join(CURRENT_TEST_PATH, 'GMM_{}_{}.wav'.format(file_handle, file_id)))
27 changes: 18 additions & 9 deletions playground.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,36 @@ def extract_axis_1(data, ind):
print(extract_axis_1(torch.tensor(matrix), to_extract))
# %%

x = torch.randn(128, 1, 151, 80)
x = torch.randn(1, 1, 125, 4) # [batch_size, 1, time_dim, embed_dim]

S = 2
W = 1 # in channels
Filter = 8 # out channels / filter size
P = int(np.ceil(((S-1)*W-S+Filter)/2))
x_pad = F.pad(x, (P//2, P//2, P//2, P//2)) # [left, right, top, bot]
# x_pad = F.pad(x, (P//2, P//2, P//2, P//2)) # [left, right, top, bot]
x_pad = F.pad(x, (1, 1, 1, 1)) # [left, right, top, bot]
# print('x shape: ', x.shape)
# print('x_pad shape: ', x_pad.shape)
# print(x_pad[0, 0, :, 1])

filters = [1] + [32, 32, 64, 64, 128, 128]
print(x.shape)
filters = [1] + [2, 2, 4, 4, 6, 6]

valid_length = torch.tensor(x.size(3))
valid_length = torch.tensor(100)
for i in range(len(filters)-1):
x = torch.nn.Conv2d(in_channels=filters[i],
out_channels=filters[i+1],
kernel_size=(3, 3),
stride=(2, 2),
padding=(2, 2))(x)
valid_length = torch.ceil(valid_length/2) + 1
print('valid_length: ', valid_length)
print(x.shape)
padding=(1, 1))(x)
valid_length = torch.tensor([torch.ceil(valid_length/2)])
post_conv_max_width = x.size(2)
mask = torch.arange(post_conv_max_width).expand(1, post_conv_max_width) < valid_length.unsqueeze(1)
mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
print('unmasked: ', x[0, 0, :, :])
print('_____________')
x = x*mask
print('masked: ', x[0, 0, :, :])
print('#############')


# padded_output_shape = torch.nn.Conv2d(in_channels=1,
Expand Down

0 comments on commit 4aaf406

Please sign in to comment.