Merge branch 'main' into ir-inferencer
Showing 4 changed files with 432 additions and 4 deletions.
@@ -1 +1,148 @@
# Migration of Mixed Precision Training

In 0.x, MMEditing does not support mixed precision training for the whole forward process. Instead, users have to apply the `auto_fp16` decorator to specific submodules and then convert the parameters of those submodules to fp16. This gives finer-grained control over the model parameters, but the approach is tedious to use, and users have to handle operations such as loss scaling during training by themselves.

MMagic 1.x uses the `AmpOptimWrapper` provided by MMEngine. In `AmpOptimWrapper.update_params`, gradient scaling and the `GradScaler` update are performed automatically, and within the `optim_context` context manager, `autocast` is applied to the whole forward process.
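For reference, the snippet below is a minimal, self-contained sketch (not taken from the MMagic code base) of how `AmpOptimWrapper` behaves on its own: `optim_context` enables `autocast` for the forward pass, and `update_params` takes care of loss scaling, `backward`, `step`, and `zero_grad`. It assumes a CUDA device is available; the module and data are made up for illustration.

```python
import torch
import torch.nn as nn
from mmengine.optim import AmpOptimWrapper

model = nn.Linear(4, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_scale='dynamic' lets the underlying GradScaler adjust the scale automatically
optim_wrapper = AmpOptimWrapper(optimizer=optimizer, loss_scale='dynamic')

x = torch.randn(8, 4).cuda()
y = torch.randn(8, 2).cuda()

# The forward pass runs under autocast inside optim_context
with optim_wrapper.optim_context(model):
    loss = nn.functional.mse_loss(model(x), y)

# Scaled backward, GradScaler update, optimizer step and zero_grad in one call
optim_wrapper.update_params(loss)
```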

Specifically, the differences between 0.x and 1.x are as follows:

<table class="docutils">
<thead>
<tr>
<th> 0.x version </th>
<th> 1.x version </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top">

```python
# Configuration
runner = dict(fp16_loss_scaler=dict(init_scale=512))
```

```python
# Code
import torch.nn as nn
from mmedit.models.builder import build_model
from mmedit.core.runners.fp16_utils import auto_fp16
# Private PyTorch helper used in 0.x for DDP bookkeeping
from torch.nn.parallel.distributed import _find_tensors


class DemoModule(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.net = build_model(cfg)

    @auto_fp16
    def forward(self, x):
        return self.net(x)


class DemoModel(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.demo_network = DemoModule(cfg)

    def train_step(self,
                   data_batch,
                   optimizer,
                   ddp_reducer=None,
                   loss_scaler=None,
                   use_apex_amp=False,
                   running_status=None):
        # Fetch data from data_batch
        inputs = data_batch['img']
        output = self.demo_network(inputs)

        optimizer.zero_grad()
        loss, log_vars = self.get_loss(output)

        if ddp_reducer is not None:
            ddp_reducer.prepare_for_backward(_find_tensors(loss))

        if loss_scaler:
            # fp16 support
            loss_scaler.scale(loss).backward()
        elif use_apex_amp:
            from apex import amp
            with amp.scale_loss(loss, optimizer,
                                loss_id=0) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if loss_scaler:
            loss_scaler.unscale_(optimizer)
            loss_scaler.step(optimizer)
        else:
            optimizer.step()
```

</td>

<td valign="top">

```python
# Configuration
optim_wrapper = dict(
    constructor='OptimWrapperConstructor',
    generator=dict(
        accumulative_counts=8,
        optimizer=dict(type='Adam', lr=0.0001, betas=(0.0, 0.999), eps=1e-06),
        type='AmpOptimWrapper',  # use the AMP optimizer wrapper
        loss_scale='dynamic'),
    discriminator=dict(
        accumulative_counts=8,
        optimizer=dict(type='Adam', lr=0.0004, betas=(0.0, 0.999), eps=1e-06),
        type='AmpOptimWrapper',  # use the AMP optimizer wrapper
        loss_scale='dynamic'))
```

```python
# Code
import torch.nn as nn
from mmagic.registry import MODULES
from mmengine.model import BaseModel


class DemoModule(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.net = MODULES.build(cfg)

    def forward(self, x):
        return self.net(x)


class DemoModel(BaseModel):

    def __init__(self, cfg):
        super().__init__()
        self.demo_network = DemoModule(cfg)

    def train_step(self, data, optim_wrapper):
        # Fetch data from the data batch
        data = self.data_preprocessor(data, True)
        inputs = data['inputs']

        # autocast (and loss scaling) is handled inside optim_context
        with optim_wrapper.optim_context(self.demo_network):
            output = self.demo_network(inputs)
            loss_dict = self.get_loss(output)
        # Use parse_losses provided by `BaseModel`
        loss, log_vars = self.parse_losses(loss_dict)
        optim_wrapper.update_params(loss)

        return log_vars
```

</td>

</tr>
</tbody>
</table>

To avoid requiring users to modify the config file, MMagic provides an `--amp` option in `train.py`, which allows users to start mixed precision training without changing the config. Mixed precision training can be started with the following commands:

```bash
bash tools/dist_train.sh CONFIG GPUS --amp

# for slurm users
bash tools/slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR --amp
```
@@ -1 +1,81 @@
# Migration of Distributed Training

We have merged [MMGeneration 1.x](https://github.com/open-mmlab/mmgeneration/tree/1.x) into MMagic. Here are the migration notes for distributed training in MMGeneration.

In 0.x, MMGeneration uses `DDPWrapper` and `DynamicRunner` to train static and dynamic models (e.g., PGGAN and StyleGANv2), respectively. In 1.x, we use `MMSeparateDistributedDataParallel` provided by MMEngine to implement distributed training.

The following is a comparison of the configs before and after the migration:

<table class="docutils">
<thead>
<tr>
<th> Static model in 0.x </th>
<th> Static model in 1.x </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top">

```python
# Use DDPWrapper
use_ddp_wrapper = True
find_unused_parameters = False

runner = dict(
    type='DynamicIterBasedRunner',
    is_dynamic_ddp=False)
```

</td>
<td valign="top">

```python
model_wrapper_cfg = dict(
    type='MMSeparateDistributedDataParallel',
    broadcast_buffers=False,
    find_unused_parameters=False)
```

</td>
</tr>
</tbody>
</table>

<table class="docutils">
<thead>
<tr>
<th> Dynamic model in 0.x </th>
<th> Dynamic model in 1.x </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top">

```python
use_ddp_wrapper = False
find_unused_parameters = False

# Use DynamicRunner
runner = dict(
    type='DynamicIterBasedRunner',
    is_dynamic_ddp=True)
```

</td>

<td valign="top">

```python
model_wrapper_cfg = dict(
    type='MMSeparateDistributedDataParallel',
    broadcast_buffers=False,
    find_unused_parameters=True)  # set `find_unused_parameters=True` for dynamic models
```

</td>

</tr>
</tbody>
</table>
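
For orientation, `model_wrapper_cfg` is a top-level field of the 1.x config file and is read by the MMEngine Runner when training is launched with a distributed launcher; the 0.x keys `use_ddp_wrapper` and `find_unused_parameters` and the `DynamicIterBasedRunner` entry are no longer needed. The excerpt below is a hypothetical sketch rather than a complete config:

```python
# Hypothetical excerpt of a 1.x config file. Only the model-wrapper part is shown;
# model, dataloaders, optim_wrapper, etc. are defined elsewhere in the same config.
model_wrapper_cfg = dict(
    type='MMSeparateDistributedDataParallel',
    broadcast_buffers=False,
    # True for dynamic architectures such as PGGAN and StyleGANv2
    find_unused_parameters=False)
```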
@@ -1 +1,157 @@
# Migration of Optimizers

We have merged [MMGeneration 1.x](https://github.com/open-mmlab/mmgeneration/tree/1.x) into MMagic. Here are the migration notes for optimizers in MMGeneration.

In 0.x, MMGeneration uses the optimizers provided by PyTorch, which only offer general parameter optimization, while in 1.x we use the `OptimWrapper` provided by MMEngine.

Compared with PyTorch's native `Optimizer`, `OptimWrapper` supports the following features (see the sketch after this list):

- `OptimWrapper.update_params` implements `zero_grad`, `backward`, and `step` in a single function.
- Automatic gradient accumulation.
- A context manager named `OptimWrapper.optim_context` that wraps the forward process. `optim_context` calls DDP's `no_sync` automatically based on the current update iteration, and in AMP (Automatic Mixed Precision) training, `autocast` is also enabled inside `optim_context`.
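
The snippet below is a minimal, self-contained sketch (not taken from MMagic) illustrating these three points with the plain `OptimWrapper`; the module and data are made up for illustration:

```python
import torch
import torch.nn as nn
from mmengine.optim import OptimWrapper

model = nn.Linear(4, 2)
optim_wrapper = OptimWrapper(
    optimizer=torch.optim.SGD(model.parameters(), lr=0.01),
    # Gradients are accumulated over 4 calls to update_params before each step
    accumulative_counts=4)

for _ in range(8):
    x, y = torch.randn(16, 4), torch.randn(16, 2)
    # optim_context skips DDP gradient synchronization on non-stepping iterations
    # and enables autocast when an AMP wrapper is used
    with optim_wrapper.optim_context(model):
        loss = nn.functional.mse_loss(model(x), y)
    # backward on every call; step and zero_grad only on every 4th call
    optim_wrapper.update_params(loss)
```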

For GAN models, the generator and the discriminator use different optimizers and training schedules. To keep the signature of a GAN model's `train_step` consistent with that of other models, we use `OptimWrapperDict`, which inherits from `OptimWrapper`, to wrap the optimizers of the generator and the discriminator. To automate this process, MMagic implements the `MultiOptimWrapperConstructor`, and you should specify this constructor in your config if you want to train a GAN model. A simplified sketch of how the wrapped optimizers are consumed is shown below.
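
This sketch is not MMagic's actual `BaseGAN.train_step`; the attributes `self.generator` / `self.discriminator` and the helpers `_disc_loss` / `_gen_loss` are hypothetical. It only shows how an `OptimWrapperDict` is indexed by the keys used in the config:

```python
# Simplified sketch: the optim_wrapper passed to train_step is an OptimWrapperDict,
# so the generator and discriminator wrappers are looked up by their config keys.
def train_step(self, data, optim_wrapper):
    data = self.data_preprocessor(data, True)

    disc_wrapper = optim_wrapper['discriminator']
    with disc_wrapper.optim_context(self.discriminator):
        disc_loss = self._disc_loss(data)  # hypothetical helper
    disc_wrapper.update_params(disc_loss)

    gen_wrapper = optim_wrapper['generator']
    with gen_wrapper.optim_context(self.generator):
        gen_loss = self._gen_loss(data)  # hypothetical helper
    gen_wrapper.update_params(gen_loss)

    return dict(loss_disc=disc_loss, loss_gen=gen_loss)
```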

The following is a comparison of the configs in 0.x and 1.x:

<table class="docutils">
<thead>
<tr>
<th> 0.x version </th>
<th> 1.x version </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top">

```python
optimizer = dict(
    generator=dict(type='Adam', lr=0.0001, betas=(0.0, 0.999), eps=1e-6),
    discriminator=dict(type='Adam', lr=0.0004, betas=(0.0, 0.999), eps=1e-6))
```

</td>

<td valign="top">

```python
optim_wrapper = dict(
    constructor='MultiOptimWrapperConstructor',
    generator=dict(optimizer=dict(type='Adam', lr=0.0002, betas=(0.0, 0.999), eps=1e-6)),
    discriminator=dict(
        optimizer=dict(type='Adam', lr=0.0004, betas=(0.0, 0.999), eps=1e-6)))
```

</td>

</tr>
</tbody>

</table>

> Note that in 1.x, MMGeneration uses `OptimWrapper` to implement gradient accumulation. As a result, the `discriminator_steps` config (a training trick that updates the generator once after several updates of the discriminator) and gradient accumulation behave differently between 0.x and 1.x.

- In 0.x, the config uses `disc_steps`, `gen_steps`, and `batch_accumulation_steps`. `disc_steps` and `batch_accumulation_steps` are counted in terms of calls to `train_step` (i.e., the number of data reads from the dataloader). Therefore, the number of consecutive discriminator updates is `disc_steps // batch_accumulation_steps`. For the generator, `gen_steps` is directly the number of consecutive generator updates.
- In 1.x, the config uses `discriminator_steps`, `generator_steps`, and `accumulative_counts`. `discriminator_steps` and `generator_steps` are the numbers of consecutive updates of one module before switching to the other.

Take the BigGAN-128 config as an example.

<table class="docutils">
<thead>
<tr>
<th> 0.x version </th>
<th> 1.x version </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top">

```python
model = dict(
    type='BasiccGAN',
    generator=dict(
        type='BigGANGenerator',
        output_scale=128,
        noise_size=120,
        num_classes=1000,
        base_channels=96,
        shared_dim=128,
        with_shared_embedding=True,
        sn_eps=1e-6,
        init_type='ortho',
        act_cfg=dict(type='ReLU', inplace=True),
        split_noise=True,
        auto_sync_bn=False),
    discriminator=dict(
        type='BigGANDiscriminator',
        input_scale=128,
        num_classes=1000,
        base_channels=96,
        sn_eps=1e-6,
        init_type='ortho',
        act_cfg=dict(type='ReLU', inplace=True),
        with_spectral_norm=True),
    gan_loss=dict(type='GANLoss', gan_type='hinge'))

# Update the discriminator for `disc_steps // batch_accumulation_steps = 8 // 8 = 1` consecutive times
# Update the generator for `gen_steps = 1` consecutive time
# Both the generator and the discriminator accumulate gradients for `batch_accumulation_steps = 8` steps before each update
train_cfg = dict(
    disc_steps=8, gen_steps=1, batch_accumulation_steps=8, use_ema=True)
```

</td>

<td valign="top">

```python
model = dict(
    type='BigGAN',
    num_classes=1000,
    data_preprocessor=dict(type='DataPreprocessor'),
    generator=dict(
        type='BigGANGenerator',
        output_scale=128,
        noise_size=120,
        num_classes=1000,
        base_channels=96,
        shared_dim=128,
        with_shared_embedding=True,
        sn_eps=1e-6,
        init_type='ortho',
        act_cfg=dict(type='ReLU', inplace=True),
        split_noise=True,
        auto_sync_bn=False),
    discriminator=dict(
        type='BigGANDiscriminator',
        input_scale=128,
        num_classes=1000,
        base_channels=96,
        sn_eps=1e-6,
        init_type='ortho',
        act_cfg=dict(type='ReLU', inplace=True),
        with_spectral_norm=True),
    # Update the discriminator for `discriminator_steps = 1` consecutive time
    # Update the generator for `generator_steps = 1` consecutive time
    generator_steps=1,
    discriminator_steps=1)

optim_wrapper = dict(
    constructor='MultiOptimWrapperConstructor',
    generator=dict(
        # The generator accumulates gradients for `accumulative_counts = 8` steps before each update
        accumulative_counts=8,
        optimizer=dict(type='Adam', lr=0.0001, betas=(0.0, 0.999), eps=1e-6)),
    discriminator=dict(
        # The discriminator accumulates gradients for `accumulative_counts = 8` steps before each update
        accumulative_counts=8,
        optimizer=dict(type='Adam', lr=0.0004, betas=(0.0, 0.999), eps=1e-6))))
```

</td>

</tr>
</tbody>

</table>