Add T5TTS #11193

Open · wants to merge 18 commits into main
160 changes: 160 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
@@ -0,0 +1,160 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  use_flash_attention: false
  lm_vocab_size: 30000
  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
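
Both configs follow the OmegaConf/Hydra conventions NeMo uses for YAML: ??? marks a mandatory value that must be supplied before the config is consumed, and ${...} entries are interpolations resolved at access time. A minimal sketch of how that behaves for the file above (the snippet and every path in it are illustrative only, not part of this PR):

# Illustrative sketch, not part of the PR: how the ??? and ${...} markers in
# megatron_t5_speechllm_inference.yaml behave under OmegaConf. Paths are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml")

# ??? marks mandatory values; they must be set (typically via Hydra command-line
# overrides) before the model consumes the config.
assert OmegaConf.is_missing(cfg, "checkpoint_path")
assert OmegaConf.is_missing(cfg.model, "codecmodel_path")

cfg.checkpoint_path = "/path/to/t5tts.ckpt"                    # placeholder
cfg.model.language_model_path = "/path/to/pretrained_t5.nemo"  # placeholder
cfg.model.codecmodel_path = "/path/to/audio_codec.nemo"        # placeholder
cfg.model.data.test_ds = "/path/to/test_manifest.json"         # placeholder

# ${...} interpolations resolve on access.
print(cfg.exp_manager.name)                    # megatron_t5_speechllm_tts_inference
print(cfg.model.validation_global_batch_size)  # 16, via ${model.global_batch_size}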
213 changes: 213 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml
@@ -0,0 +1,213 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  train_from_scratch: true
  override_tokenizer_vocab_file: ???
  use_flash_attention: false
  lm_vocab_size: 30000

  frozen_model:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    pipeline_model_parallel_split_rank: 0
    make_vocab_size_divisible_by: 128
    pre_process: true
    post_process: true
    gradient_as_bucket_view: true
    native_amp_init_scale: 4294967296
    native_amp_growth_interval: 1000
    fp16_lm_cross_entropy: false
    seed: 1234
    use_cpu_initialization: false
    apex_transformer_log_level: 30
    tokenizer:
      library: megatron
      type: BertWordPieceCase
      model: null
      vocab_file: null
      merge_file: null
    optim:
      name: null
    data:
      dataset_type: t5
    encoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu
    decoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
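
The _model variant above differs from the first config mainly in that it builds the T5 backbone from scratch: it drops model.language_model_path, adds model.train_from_scratch and model.override_tokenizer_vocab_file, and spells out the frozen_model encoder/decoder architecture inline. A small sketch (again illustrative, not part of the PR; it assumes the nesting shown in the diffs above) that surfaces that delta programmatically:

# Illustrative sketch, not part of the PR: compare the two inference configs to
# see what the *_inference_model.yaml variant adds or removes at the model level.
from omegaconf import OmegaConf

base = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml")
full = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml")

base_keys, full_keys = set(base.model.keys()), set(full.model.keys())
print(sorted(full_keys - base_keys))  # ['frozen_model', 'override_tokenizer_vocab_file', 'train_from_scratch']
print(sorted(base_keys - full_keys))  # ['language_model_path']

# The inline T5 architecture that train_from_scratch builds instead of loading a
# pretrained .nemo file: 12-layer encoder/decoder, hidden size 768, geglu MLPs.
enc = full.model.frozen_model.encoder
print(enc.num_layers, enc.hidden_size, enc.ffn_hidden_size, enc.activation)  # 12 768 2048 geglu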