forked from Stability-AI/stable-audio-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
113 lines (87 loc) · 4.12 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from prefigure.prefigure import get_all_args, push_wandb_config
import json
import os
import torch
import pytorch_lightning as pl
import random
from stable_audio_tools.data.dataset import create_dataloader_from_configs_and_args
from stable_audio_tools.models import create_model_from_config
from stable_audio_tools.models.utils import load_ckpt_state_dict
from stable_audio_tools.training import create_training_wrapper_from_config, create_demo_callback_from_config
from stable_audio_tools.training.utils import copy_state_dict
class ExceptionCallback(pl.Callback):
    """Lightning callback that reports exceptions on stdout."""

    def on_exception(self, trainer, module, err):
        """Print the exception as ``<ExceptionTypeName>: <message>``.

        ``trainer`` and ``module`` are accepted to satisfy the hook
        signature but are not used.
        """
        err_kind = type(err).__name__
        print(f'{err_kind}: {err}')
class ModelConfigEmbedderCallback(pl.Callback):
    """Lightning callback that embeds the model config in saved checkpoints."""

    def __init__(self, model_config):
        """Remember ``model_config`` so it can be written at save time."""
        self.model_config = model_config

    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
        """Store the remembered config under ``checkpoint["model_config"]``.

        ``trainer`` and ``pl_module`` are part of the hook signature and
        are not used here.
        """
        checkpoint.update(model_config=self.model_config)
def main():
    """Run a training job configured by CLI arguments and JSON config files.

    Steps, in order: read arguments, seed the RNGs, load the model and
    dataset JSON configs, build the dataloader / model / training wrapper,
    set up the W&B logger and callbacks, pick a distributed strategy, and
    finally call ``Trainer.fit``.
    """
    args = get_all_args()

    # Offset the base seed by the SLURM process id (when present) so that
    # each process ends up with a different seed.
    seed = args.seed
    slurm_procid = os.environ.get("SLURM_PROCID")
    if slurm_procid is not None:
        seed += int(slurm_procid)

    random.seed(seed)
    torch.manual_seed(seed)

    # Load the JSON configs named by args.model_config / args.dataset_config.
    with open(args.model_config) as model_config_file:
        model_config = json.load(model_config_file)

    with open(args.dataset_config) as dataset_config_file:
        dataset_config = json.load(dataset_config_file)

    train_dl = create_dataloader_from_configs_and_args(model_config, args, dataset_config)

    model = create_model_from_config(model_config)

    # Optionally initialise the whole model from a pretrained checkpoint.
    if args.pretrained_ckpt_path:
        pretrained_state = load_ckpt_state_dict(args.pretrained_ckpt_path)
        copy_state_dict(model, pretrained_state)

    # Optionally load weights for the pretransform sub-module only.
    if args.pretransform_ckpt_path:
        pretransform_state = load_ckpt_state_dict(args.pretransform_ckpt_path)
        model.pretransform.load_state_dict(pretransform_state)

    training_wrapper = create_training_wrapper_from_config(model_config, model)

    wandb_logger = pl.loggers.WandbLogger(project=args.name)
    wandb_logger.watch(training_wrapper)

    exc_callback = ExceptionCallback()

    # Checkpoints go under <save_dir>/<project>/<run id>/checkpoints, but only
    # when a save dir was given and the experiment exposes a string id;
    # otherwise dirpath is left as None.
    checkpoint_dir = None
    if args.save_dir and isinstance(wandb_logger.experiment.id, str):
        experiment = wandb_logger.experiment
        checkpoint_dir = os.path.join(
            args.save_dir, experiment.project, experiment.id, "checkpoints"
        )

    ckpt_callback = pl.callbacks.ModelCheckpoint(
        every_n_train_steps=args.checkpoint_every,
        dirpath=checkpoint_dir,
        save_top_k=-1,
    )
    save_model_config_callback = ModelConfigEmbedderCallback(model_config)

    demo_callback = create_demo_callback_from_config(model_config, demo_dl=train_dl)

    # Merge the parsed configs into the args dict and push it to W&B.
    # NOTE: vars(args) is the namespace's own dict, so this also overwrites
    # args.model_config / args.dataset_config with the parsed configs.
    args_dict = vars(args)
    args_dict.update(model_config=model_config, dataset_config=dataset_config)
    push_wandb_config(wandb_logger, args_dict)

    # Choose the multi-GPU strategy: an explicit args.strategy wins,
    # otherwise fall back to DDP for multiple GPUs or "auto" for one.
    requested_strategy = args.strategy
    if not requested_strategy:
        strategy = "ddp_find_unused_parameters_true" if args.num_gpus > 1 else "auto"
    elif requested_strategy == "deepspeed":
        # Imported lazily so DeepSpeed is only needed when requested.
        from pytorch_lightning.strategies import DeepSpeedStrategy

        strategy = DeepSpeedStrategy(
            stage=2,
            contiguous_gradients=True,
            overlap_comm=True,
            reduce_scatter=True,
            reduce_bucket_size=5e8,
            allgather_bucket_size=5e8,
            load_full_weights=True,
        )
    else:
        strategy = requested_strategy

    trainer_kwargs = {
        "devices": args.num_gpus,
        "accelerator": "gpu",
        "num_nodes": args.num_nodes,
        "strategy": strategy,
        "precision": args.precision,
        "accumulate_grad_batches": args.accum_batches,
        "callbacks": [ckpt_callback, demo_callback, exc_callback, save_model_config_callback],
        "logger": wandb_logger,
        "log_every_n_steps": 1,
        "max_epochs": 10000000,
        "default_root_dir": args.save_dir,
        "gradient_clip_val": args.gradient_clip_val,
    }
    trainer = pl.Trainer(**trainer_kwargs)

    # A falsy ckpt_path is passed as None.
    trainer.fit(training_wrapper, train_dl, ckpt_path=args.ckpt_path or None)
# Only start training when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()