Speaker tutorials #1146

Merged: 5 commits, Sep 10, 2020
@@ -1,6 +1,6 @@
-name: &name "Voxnet"
+name: &name "SpeakerNet"
 sample_rate: &sample_rate 16000
-repeat: &rep 1
+repeat: &rep 2
 dropout: &drop 0.5
 separable: &separable True
 n_filters: &n_filters 512
@@ -23,16 +23,16 @@ model:
     time_length: 8
 
   test_ds:
-    manifest_filepath: null
+    manifest_filepath: ???
     sample_rate: 16000
     labels: null
-    batch_size: 128
+    batch_size: 1
     shuffle: False
     time_length: 8
-    embedding_dir: './'
+    embedding_dir: '.'
 
   preprocessor:
-    cls: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
+    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
     params:
       normalize: "per_feature"
       window_size: 0.02
@@ -46,7 +46,7 @@ model:
       stft_conv: false
 
   encoder:
-    cls: nemo.collections.asr.modules.ConvASREncoder
+    _target_: nemo.collections.asr.modules.ConvASREncoder
     params:
       feat_in: *n_mels
       activation: relu
@@ -99,21 +99,26 @@ model:
           separable: *separable
 
   decoder:
-    cls: nemo.collections.asr.modules.SpeakerDecoder
+    _target_: nemo.collections.asr.modules.SpeakerDecoder
     params:
       feat_in: *enc_feat_out
       num_classes: 2
       pool_mode: 'xvector'
       emb_sizes: 512,512
+      angular: False
 
+  loss:
+    scale: 30
+    margin: 0.2
+
   optim:
     name: novograd
-    # cls: nemo.core.optim.optimizers.Novograd
+    # _target_: nemo.core.optim.optimizers.Novograd
     lr: .008
     # optimizer arguments
     args:
       name: auto
-      # cls: nemo.core.config.optimizers.NovogradParams
+      # _target_: nemo.core.config.optimizers.NovogradParams
       params:
         betas: [0.95, 0.5]
         weight_decay: 0.001
@@ -131,21 +136,22 @@ model:
       # scheduler config override
       args:
         name: auto
-        # cls: nemo.core.config.schedulers.CosineAnnealingParams
+        # _target_: nemo.core.config.schedulers.CosineAnnealingParams
         params:
           warmup_steps: null
           warmup_ratio: 0.1
           min_lr: 0.0
           last_epoch: -1
 
 trainer:
-  gpus: 2 # number of gpus
-  max_epochs: 100
+  gpus: 1 # number of gpus
+  max_epochs: 5
   max_steps: null # computed at runtime if not set
   num_nodes: 1
   distributed_backend: ddp
   accumulate_grad_batches: 1
-  amp_level: O1
+  amp_level: O0
+  deterministic: True
   checkpoint_callback: False
   logger: False
   row_log_interval: 1 # Interval of logging.
@@ -156,10 +162,3 @@ exp_manager:
   name: *name
   create_tensorboard_logger: True
   create_checkpoint_callback: True
-
-hydra:
-  run:
-    dir: .
-  job_logging:
-    root:
-      handlers: null
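
Editorial note: two Hydra/OmegaConf conventions recur throughout this PR (the file diffed above is presumably examples/speaker_recognition/conf/SpeakerNet_recognition_3x2x512.yaml, given the references in speaker_reco.py below). "???" marks a mandatory value that must be supplied, e.g. via a command-line override, before the config can be used, and "_target_" replaces the older "cls" key as Hydra's standard name for the class to instantiate. A minimal sketch of the "???" behavior, using only omegaconf (the manifest filename is hypothetical):

    from omegaconf import OmegaConf
    from omegaconf.errors import MissingMandatoryValue

    cfg = OmegaConf.create({"model": {"test_ds": {"manifest_filepath": "???"}}})

    try:
        _ = cfg.model.test_ds.manifest_filepath  # "???" makes the value mandatory
    except MissingMandatoryValue:
        print("manifest_filepath must be provided, e.g. via a CLI override")

    # Equivalent to passing model.test_ds.manifest_filepath=... on the command line:
    cfg.model.test_ds.manifest_filepath = "test_manifest.json"  # hypothetical path
    print(cfg.model.test_ds.manifest_filepath)
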
examples/speaker_recognition/conf/SpeakerNet_verification_3x2x512.yaml (new file, 155 additions)
@@ -0,0 +1,155 @@
name: &name "SpeakerNet"
sample_rate: &sample_rate 16000
repeat: &rep 2
dropout: &drop 0.5
separable: &separable True
n_filters: &n_filters 512

model:
train_ds:
manifest_filepath: ???
sample_rate: 16000
labels: null
batch_size: 64
shuffle: True
time_length: 8

validation_ds:
manifest_filepath: ???
sample_rate: 16000
labels: null
batch_size: 128
shuffle: False
time_length: 8

preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
params:
normalize: "per_feature"
window_size: 0.02
sample_rate: *sample_rate
window_stride: 0.01
window: "hann"
features: &n_mels 64
n_fft: 512
frame_splicing: 1
dither: 0.00001
stft_conv: false

encoder:
_target_: nemo.collections.asr.modules.ConvASREncoder
params:
feat_in: *n_mels
activation: relu
conv_mask: true

jasper:
- filters: *n_filters
repeat: 1
kernel: [3]
stride: [1]
dilation: [1]
dropout: *drop
residual: true
separable: *separable

- filters: *n_filters
repeat: *rep
kernel: [5]
stride: [1]
dilation: [1]
dropout: *drop
residual: true
separable: *separable

- filters: *n_filters
repeat: *rep
kernel: [7]
stride: [1]
dilation: [1]
dropout: *drop
residual: true
separable: *separable

- filters: *n_filters
repeat: *rep
kernel: [9]
stride: [1]
dilation: [1]
dropout: *drop
residual: true
separable: *separable

- filters: &enc_feat_out 1500
repeat: 1
kernel: [1]
stride: [1]
dilation: [1]
dropout: 0.0
residual: false
separable: *separable

decoder:
_target_: nemo.collections.asr.modules.SpeakerDecoder
params:
feat_in: *enc_feat_out
num_classes: 2
pool_mode: 'xvector'
emb_sizes: 512,512
angular: True

loss:
scale: 30
margin: 0.2

optim:
name: novograd
# _target_: nemo.core.optim.optimizers.Novograd
lr: .006
# optimizer arguments
args:
name: auto
# _target_: nemo.core.config.optimizers.NovogradParams
params:
betas: [0.95, 0.5]
weight_decay: 0.001

# scheduler setup
sched:
name: CosineAnnealing
iters_per_batch: 1 # computed at runtime
max_steps: null # computed at runtime or explicitly set here

# pytorch lightning args
monitor: val_loss
reduce_on_plateau: false

# scheduler config override
args:
name: auto
# _target_: nemo.core.config.schedulers.CosineAnnealingParams
params:
warmup_steps: null
warmup_ratio: 0.1
min_lr: 0.0
last_epoch: -1

trainer:
gpus: 1 # number of gpus
max_epochs: 200
max_steps: null # computed at runtime if not set
num_nodes: 1
distributed_backend: ddp
accumulate_grad_batches: 1
amp_level: O0
deterministic: True
checkpoint_callback: False
logger: False
row_log_interval: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations

exp_manager:
exp_dir: null
name: *name
create_tensorboard_logger: True
create_checkpoint_callback: True
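
The verification config above differs from the recognition config mainly in angular: True together with the loss block (scale: 30, margin: 0.2): the decoder is trained with an angular-margin softmax so that cosine similarity between speaker embeddings is directly usable for verification. Below is a minimal sketch of how scale and margin enter such a loss, assuming the standard additive angular margin (AAM) softmax formulation; it is an illustration, not NeMo's exact implementation.

    import torch
    import torch.nn.functional as F

    def angular_margin_loss(embeddings, class_weights, labels, scale=30.0, margin=0.2):
        # Cosine similarity between L2-normalized embeddings and class weights.
        cosine = F.normalize(embeddings) @ F.normalize(class_weights).t()
        theta = torch.acos(cosine.clamp(-1.0 + 1e-7, 1.0 - 1e-7))
        # Add the margin to the target-class angle only, then rescale all logits.
        target = F.one_hot(labels, num_classes=class_weights.size(0)).bool()
        logits = scale * torch.where(target, torch.cos(theta + margin), cosine)
        return F.cross_entropy(logits, labels)

    # Toy usage: a batch of 4 embeddings (dim 512) and 2 speaker classes.
    emb = torch.randn(4, 512)
    weights = torch.randn(2, 512)
    labels = torch.tensor([0, 1, 0, 1])
    print(angular_margin_loss(emb, weights, labels))
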
examples/speaker_recognition/speaker_reco.py (4 additions, 4 deletions)
@@ -24,12 +24,12 @@
 from nemo.utils.exp_manager import exp_manager
 
 """
-Basic run (on CPU for 50 epochs):
+Basic run (on GPU for 10 epochs):
 EXP_NAME=sample_run
-python ./speaker_reco.py --config-path='conf' --config-name='config.yaml' \
+python ./speaker_reco.py --config-path='conf' --config-name='SpeakerNet_recognition_3x2x512.yaml' \
     trainer.max_epochs=10 \
     model.train_ds.batch_size=64 model.validation_ds.batch_size=64 \
-    trainer.gpus=0 \
+    trainer.gpus=1 \
     model.decoder.params.num_classes=2 \
     exp_manager.name=$EXP_NAME +exp_manager.use_datetime_version=False \
     exp_manager.exp_dir='./speaker_exps'
@@ -44,7 +44,7 @@
 seed_everything(42)
 
 
-@hydra_runner(config_path="conf", config_name="config")
+@hydra_runner(config_path="conf", config_name="SpeakerNet_recognition_3x2x512.yaml")
 def main(cfg):
 
     logging.info(f'Hydra config: {cfg.pretty()}')
nemo/collections/asr/data/audio_to_label.py (2 deletions)
@@ -19,12 +19,10 @@
 from nemo.core.classes import Dataset
 from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType
 from nemo.utils import logging
-from nemo.utils.decorators import experimental
 
 __all__ = ['AudioToSpeechLabelDataSet']
 
 
-@experimental
 class AudioToSpeechLabelDataSet(Dataset):
     """Data Layer for general speech classification.
     Module which reads speech recognition with target label. It accepts comma-separated
nemo/collections/asr/models/label_models.py (65 additions)
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import json
 import os
 import pickle as pkl
 from typing import Dict, Optional, Union
 
 import torch
 from omegaconf import DictConfig
+from omegaconf.omegaconf import open_dict
 from pytorch_lightning import Trainer
 
 from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataSet
@@ -104,6 +106,8 @@ def setup_validation_data(self, val_data_layer_config: Optional[Union[DictConfig
     def setup_test_data(self, test_data_layer_params: Optional[Union[DictConfig, Dict]]):
         if 'shuffle' not in test_data_layer_params:
             test_data_layer_params['shuffle'] = False
+        if hasattr(self, 'dataset'):
+            test_data_layer_params['labels'] = self.dataset.labels
         self.embedding_dir = test_data_layer_params.get('embedding_dir', './')
         self.test_manifest = test_data_layer_params.get('manifest_filepath', None)
         self._test_dl = self.__setup_dataloader_from_config(config=test_data_layer_params)
@@ -212,6 +216,67 @@ def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0):
 
         return {'log': tensorboard_log}
 
+    def setup_finetune_model(self, model_config: DictConfig):
+        """
+        Sets up training, validation, and test data with the newly provided config.
+        Checks for labels set up during training from scratch; if there are none,
+        labels are set up from the manifest files of the provided finetuning data.
+
+        Args:
+            model_config: cfg which has train_ds, optional validation_ds, optional test_ds,
+                and mandatory encoder and decoder model params.
+                Make sure you set num_classes correctly for the finetuning data.
+
+        Returns: None
+        """
+        if hasattr(self, 'dataset'):
+            scratch_labels = self.dataset.labels
+        else:
+            scratch_labels = None
+
+        logging.info("Setting up data loaders with manifests provided from model_config")
+
+        if 'train_ds' in model_config and model_config.train_ds is not None:
+            self.setup_training_data(model_config.train_ds)
+        else:
+            raise KeyError("train_ds is not found in model_config but you need it for fine tuning")
+
+        if self.dataset.labels is None or len(self.dataset.labels) == 0:
+            raise ValueError(f'New labels must be a non-empty list of labels. But I got: {self.dataset.labels}')
+
+        if 'valid_ds' in model_config and model_config.valid_ds is not None:
+            self.setup_multiple_validation_data(model_config.valid_ds)
+
+        if 'test_ds' in model_config and model_config.test_ds is not None:
+            self.setup_multiple_test_data(model_config.test_ds)
+
+        if scratch_labels == self.dataset.labels:  # checking for new finetune dataset labels
+            logging.warning(
+                "Trained dataset labels are the same as finetune dataset labels -- continuing with change of decoder parameters"
+            )
+        elif scratch_labels is None:
+            logging.warning(
+                "Either you provided a dummy manifest file during training from scratch or you restored from a pretrained nemo file"
+            )
+
+        decoder_config = model_config.decoder
+        new_decoder_config = copy.deepcopy(decoder_config)
+        if new_decoder_config['params']['num_classes'] != len(self.dataset.labels):
+            raise ValueError(
+                "number of classes provided {} is not the same as the number of different labels in finetuning data: {}".format(
+                    new_decoder_config['params']['num_classes'], len(self.dataset.labels)
+                )
+            )
+
+        del self.decoder
+        self.decoder = EncDecSpeakerLabelModel.from_config_dict(new_decoder_config)
+
+        with open_dict(self._cfg.decoder):
+            self._cfg.decoder = new_decoder_config
+
+        logging.info(f"Changed decoder output to # {self.decoder._num_classes} classes.")
+
 
 class ExtractSpeakerEmbeddingsModel(EncDecSpeakerLabelModel):
     """
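
A hypothetical usage sketch for the new setup_finetune_model hook (checkpoint and config paths below are illustrative, not from this PR): restore a pretrained model, point it at the finetuning data, and let the hook swap in a decoder sized for the new label set before training resumes.

    import pytorch_lightning as pl
    from omegaconf import OmegaConf

    from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel

    # Restore a model trained from scratch (assumed .nemo checkpoint path).
    model = EncDecSpeakerLabelModel.restore_from("speakernet.nemo")

    # The finetune config must carry model.train_ds and model.decoder with
    # num_classes matching the new label set (see the docstring above);
    # valid_ds and test_ds are optional.
    cfg = OmegaConf.load("finetune.yaml")  # hypothetical config file
    model.setup_finetune_model(cfg.model)

    trainer = pl.Trainer(gpus=1, max_epochs=5)
    trainer.fit(model)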