Merge pull request #447 from nemonameless/add_largedit_uvit_mp
add dit uvit dp mp sharding parallel
nemonameless authored Mar 4, 2024
2 parents 7d22d5f + cdeee84 commit dcb9e75
Showing 23 changed files with 1,487 additions and 538 deletions.
@@ -17,7 +17,7 @@ TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'
TRAINERS_NUM=1 # nnodes, machine num
TRAINING_GPUS_PER_NODE=8 # nproc_per_node
- DP_DEGREE=1 # dp_parallel_degree
+ DP_DEGREE=8 # dp_parallel_degree
MP_DEGREE=1 # tensor_parallel_degree
SHARDING_DEGREE=1 # sharding_parallel_degree

@@ -30,7 +30,7 @@ num_workers=8
max_steps=7000000
logging_steps=50
save_steps=5000
- image_logging_steps=5000
+ image_logging_steps=-1
seed=0

USE_AMP=True
@@ -68,4 +68,9 @@ ${TRAINING_PYTHON} train_image_generation_trainer.py \
--seed ${seed} \
--recompute ${recompute} \
--enable_xformers_memory_efficient_attention ${enable_xformers} \
- --bf16 ${USE_AMP}
+ --bf16 ${USE_AMP} \
+ --dp_degree ${DP_DEGREE} \
+ --tensor_parallel_degree ${MP_DEGREE} \
+ --sharding_parallel_degree ${SHARDING_DEGREE} \
+ --pipeline_parallel_degree 1 \
+ --sep_parallel_degree 1 \
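These new flags feed the hybrid-parallel setup added in this PR (`dist_env.py`, below): `setdistenv` asserts that the world size is divisible by `tensor_parallel_degree * pipeline_parallel_degree` and then re-derives `dp_degree` from the remaining ranks, so `DP_DEGREE=8` here is effectively a hint. A shell sketch of that constraint, reusing the script's variables (the check itself is illustrative, not part of the script):

```bash
# illustrative check mirroring the assert in dist_env.setdistenv (not part of the script)
world_size=$((TRAINERS_NUM * TRAINING_GPUS_PER_NODE))   # 1 node x 8 GPUs = 8
pp_degree=1                                             # pipeline degree is fixed to 1 here
if (( world_size % (MP_DEGREE * pp_degree) != 0 )); then
    echo "world_size must be divisible by MP_DEGREE * pp_degree" >&2
fi
echo "derived dp_degree: $((world_size / (MP_DEGREE * SHARDING_DEGREE * pp_degree)))"
```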
@@ -17,7 +17,7 @@ TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'
TRAINERS_NUM=1 # nnodes, machine num
TRAINING_GPUS_PER_NODE=8 # nproc_per_node
- DP_DEGREE=1 # dp_parallel_degree
+ DP_DEGREE=8 # dp_parallel_degree
MP_DEGREE=1 # tensor_parallel_degree
SHARDING_DEGREE=1 # sharding_parallel_degree

@@ -68,4 +68,9 @@ ${TRAINING_PYTHON} train_image_generation_trainer.py \
--seed ${seed} \
--recompute ${recompute} \
--enable_xformers_memory_efficient_attention ${enable_xformers} \
- --bf16 ${USE_AMP}
+ --bf16 ${USE_AMP} \
+ --dp_degree ${DP_DEGREE} \
+ --tensor_parallel_degree ${MP_DEGREE} \
+ --sharding_parallel_degree ${SHARDING_DEGREE} \
+ --pipeline_parallel_degree 1 \
+ --sep_parallel_degree 1 \
@@ -17,20 +17,22 @@ TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'
TRAINERS_NUM=1 # nnodes, machine num
TRAINING_GPUS_PER_NODE=8 # nproc_per_node
- DP_DEGREE=1 # dp_parallel_degree
- MP_DEGREE=1 # tensor_parallel_degree
+ DP_DEGREE=8 # dp_parallel_degree
+ MP_DEGREE=2 # tensor_parallel_degree
SHARDING_DEGREE=1 # sharding_parallel_degree

+ accumulation_steps=2 # gradient_accumulation_steps

config_file=config/LargeDiT_3B_patch2.json
OUTPUT_DIR=./output_trainer/LargeDiT_3B_patch2_trainer

feature_path=./data/fastdit_imagenet256
- batch_size=32 # per gpu
+ batch_size=16 # per gpu
num_workers=8
max_steps=7000000
logging_steps=50
save_steps=5000
- image_logging_steps=5000
+ image_logging_steps=-1
seed=0

USE_AMP=True
@@ -45,7 +47,7 @@ ${TRAINING_PYTHON} train_image_generation_trainer.py \
--feature_path ${feature_path} \
--output_dir ${OUTPUT_DIR} \
--per_device_train_batch_size ${batch_size} \
- --gradient_accumulation_steps 1 \
+ --gradient_accumulation_steps ${accumulation_steps} \
--learning_rate 1e-4 \
--weight_decay 0.0 \
--max_steps ${max_steps} \
@@ -68,4 +70,8 @@ ${TRAINING_PYTHON} train_image_generation_trainer.py \
--seed ${seed} \
--recompute ${recompute} \
--enable_xformers_memory_efficient_attention ${enable_xformers} \
- --bf16 ${USE_AMP}
+ --bf16 ${USE_AMP} \
+ --dp_degree ${DP_DEGREE} \
+ --tensor_parallel_degree ${MP_DEGREE} \
+ --sharding_parallel_degree ${SHARDING_DEGREE} \
+ --pipeline_parallel_degree 1 \
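The three value changes above interact: with `MP_DEGREE=2` on one 8-GPU node, `setdistenv` derives `dp_degree = 8 / (2 * 1 * 1) = 4`, so halving the per-GPU batch to 16 while doubling accumulation to 2 gives an effective global batch of 16 * 2 * 4 = 128 (the old pure data-parallel config gave 32 * 1 * 8 = 256). A quick arithmetic check, assuming `setdistenv`'s derivation:

```bash
# effective global batch under the new LargeDiT_3B settings (assumes setdistenv's derivation)
world_size=8; mp=2; sharding=1; pp=1
dp=$((world_size / (mp * sharding * pp)))   # 4 data-parallel replicas
echo "global batch: $((16 * 2 * dp))"       # per-GPU 16 x accum 2 x dp 4 = 128
```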
@@ -36,17 +36,25 @@ Tips:

#### 1.3.2 Single-node multi-GPU training
```bash
- config_file=config/DiT_XL_patch2.json
- OUTPUT_DIR=./output/DiT_XL_patch2_trainer
TRAINING_MODEL_RESUME="None"
TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'
TRAINERS_NUM=1 # nnodes, machine num
TRAINING_GPUS_PER_NODE=8 # nproc_per_node
+ DP_DEGREE=8 # dp_parallel_degree
+ MP_DEGREE=1 # tensor_parallel_degree
+ SHARDING_DEGREE=1 # sharding_parallel_degree

+ # config_file=config/SiT_XL_patch2.json
+ # OUTPUT_DIR=./output/SiT_XL_patch2_trainer
+ config_file=config/DiT_XL_patch2.json
+ OUTPUT_DIR=./output_trainer/DiT_XL_patch2_trainer

feature_path=./data/fastdit_imagenet256
batch_size=32 # per gpu
num_workers=8
max_steps=7000000
logging_steps=50
save_steps=5000
+ image_logging_steps=-1
seed=0

USE_AMP=True
@@ -55,7 +63,8 @@ enable_tensorboard=True
recompute=True
enable_xformers=True

- python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_image_generation_trainer.py \
+ TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes ${TRAINERS_NUM} --nproc_per_node ${TRAINING_GPUS_PER_NODE} --ips ${TRAINER_INSTANCES}"
+ ${TRAINING_PYTHON} train_image_generation_trainer.py \
--do_train \
--feature_path ${feature_path} \
--output_dir ${OUTPUT_DIR} \
@@ -66,10 +75,10 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_image_gene
--max_steps ${max_steps} \
--lr_scheduler_type "constant" \
--warmup_steps 0 \
- --image_logging_steps 1000 \
+ --image_logging_steps ${image_logging_steps} \
--logging_dir ${OUTPUT_DIR}/tb_log \
--logging_steps ${logging_steps} \
- --save_steps 10000 \
+ --save_steps ${save_steps} \
--save_total_limit 50 \
--dataloader_num_workers ${num_workers} \
--vae_name_or_path stabilityai/sd-vae-ft-mse \
@@ -83,24 +92,42 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_image_gene
--seed ${seed} \
--recompute ${recompute} \
--enable_xformers_memory_efficient_attention ${enable_xformers} \
- --bf16 ${USE_AMP}
+ --bf16 ${USE_AMP} \
+ --dp_degree ${DP_DEGREE} \
+ --tensor_parallel_degree ${MP_DEGREE} \
+ --sharding_parallel_degree ${SHARDING_DEGREE} \
+ --pipeline_parallel_degree 1 \
+ --sep_parallel_degree 1 \
```
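For reference, `${TRAINING_PYTHON}` is plain shell substitution; with the defaults above it expands to a single-node launch of the form sketched below (illustration only):

```bash
# expansion of ${TRAINING_PYTHON} with the defaults above (illustration only)
python -m paddle.distributed.launch \
    --master 127.0.0.1:8080 \
    --nnodes 1 \
    --nproc_per_node 8 \
    --ips 127.0.0.1 \
    train_image_generation_trainer.py --do_train ...  # plus the remaining flags above
```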

### 1.4 Training with custom training logic

#### 1.4.1 Single-node multi-GPU training
```bash
+ config_file=config/DiT_XL_patch2.json
+ results_dir=./output_notrainer/DiT_XL_patch2_notrainer
+
+ feature_path=./data/fastdit_imagenet256
+ global_batch_size=256
+ num_workers=8
+ max_steps=7000000
+ logging_steps=50
+ save_steps=5000
+
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" \
train_image_generation_notrainer.py \
- --config_file config/DiT_XL_patch2.json \
- --feature_path ./data/fastdit_imagenet256 \
- --global_batch_size 256
+ --config_file ${config_file} \
+ --feature_path ${feature_path} \
+ --global_batch_size ${global_batch_size} \
+ --num_workers ${num_workers} \
+ --log_every ${logging_steps} \
+ --ckpt_every ${save_steps} \
```
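Unlike the trainer script, the no-trainer entry point takes a single `global_batch_size` rather than a per-device one; assuming it is split evenly across the eight launched ranks, each GPU processes 256 / 8 = 32 samples per step, which matches the trainer script's per-GPU `batch_size=32`.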


## 2 Model Inference

- Once training finishes, the trained model weights are saved under `output_dir`. Note that DiT inference can use the DiTPipeline from ppdiffusers, but SiT inference does not yet support generating a `Pipeline`.
+ Once training finishes, the trained model weights are saved under `output_dir`. Note that DiT inference can use the DiTPipeline from ppdiffusers, **but SiT inference does not yet support generating a `Pipeline`**.
You can refer to and run `python infer_demo_dit.py` or `python infer_demo_sit.py`.

For DiT, `tools/convert_dit_to_ppdiffusers.py` can be used to generate the `Pipeline` used for inference.
@@ -128,22 +155,22 @@ python tools/convert_dit_to_ppdiffusers.py
After generating the `Pipeline` weights, we can run inference with the following code.

```python
- from ppdiffusers import DiTPipeline, DPMSolverMultistepScheduler, DDIMScheduler
import paddle
from paddlenlp.trainer import set_seed
- dtype=paddle.float32
- pipe=DiTPipeline.from_pretrained("./DiT_XL_2_256", paddle_dtype=dtype)
- #pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

+ from ppdiffusers import DDIMScheduler, DiTPipeline

+ dtype = paddle.float32
+ pipe = DiTPipeline.from_pretrained("./DiT_XL_2_256", paddle_dtype=dtype)
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

words = ["white shark"]
words = ["golden retriever"] # class_ids [207]
class_ids = pipe.get_label_ids(words)

set_seed(42)
generator = paddle.Generator().manual_seed(0)
image = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator).images[0]
image.save("white_shark.png")
print(f'\nGPU memory usage: {paddle.device.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB')
image.save("result_DiT_golden_retriever.png")
```
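The previous revision of this snippet carried a commented-out `DPMSolverMultistepScheduler` line; if you want fewer sampling steps, a minimal sketch reusing `pipe`, `class_ids`, and `generator` from above (the step count and output filename are illustrative):

```python
# optional: swap in DPM-Solver++ for fewer steps (sketch; mirrors the DDIM swap above)
from ppdiffusers import DPMSolverMultistepScheduler

pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
image = pipe(class_labels=class_ids, num_inference_steps=20, generator=generator).images[0]
image.save("result_DiT_golden_retriever_dpm.png")
```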


@@ -13,11 +13,17 @@
# limitations under the License.

from . import gaussian_diffusion as gd
+ from .dist_env import setdistenv
from .dit import DiT
+ from .dit_llama import DiT_Llama
from .respace import SpacedDiffusion, space_timesteps
from .trainer import LatentDiffusionTrainer
- from .trainer_args import DataArguments, ModelArguments, NoTrainerTrainingArguments
+ from .trainer_args import (
+     DataArguments,
+     ModelArguments,
+     NoTrainerTrainingArguments,
+     TrainerArguments,
+ )
from .trainer_model import DiTDiffusionModel

# Modified from OpenAI's diffusion repos
@@ -0,0 +1,86 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import numpy as np
import paddle
import paddle.distributed as dist
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker


def set_hybrid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0):
    device_id = paddle.device.get_device()
    assert "gpu" in device_id

    # every data-parallel rank gets its own base seed
    random.seed(basic_seed + data_world_rank)
    np.random.seed(basic_seed + data_world_rank)
    paddle.seed(basic_seed + data_world_rank)

    # local_seed/global_seed are used to control dropout in ModelParallel
    local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank
    global_seed = 2048 + basic_seed + data_world_rank
    tracker = get_rng_state_tracker()
    tracker.add("global_seed", global_seed)
    tracker.add("local_seed", local_seed)


def setdistenv(args):
    world_size = dist.get_world_size()
    if world_size > 1:
        args.dp_degree = max(args.dp_degree, 1)
        args.sharding_parallel_degree = max(args.sharding_parallel_degree, 1)
        args.tensor_parallel_degree = max(args.tensor_parallel_degree, 1)
        args.sep_parallel_degree = max(args.sep_parallel_degree, 1)
        args.pipeline_parallel_degree = max(args.pipeline_parallel_degree, 1)

        assert (
            world_size % (args.tensor_parallel_degree * args.pipeline_parallel_degree) == 0
        ), f"Total world_size:{world_size} should be divisible by tensor_parallel_degree: {args.tensor_parallel_degree} and pipeline_parallel_degree: {args.pipeline_parallel_degree}."

        # dp_degree is re-derived from whatever ranks the other degrees leave over
        args.dp_degree = world_size // (
            args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree
        )
        strategy = fleet.DistributedStrategy()
        strategy.hybrid_configs = {
            "dp_degree": args.dp_degree,
            "mp_degree": args.tensor_parallel_degree,
            "sharding_degree": args.sharding_parallel_degree,
            "pp_degree": args.pipeline_parallel_degree,
        }
        # strategy.find_unused_parameters = True

        # set control in tensor parallel
        strategy.tensor_parallel_configs = {"tensor_init_seed": args.seed}

        fleet.init(is_collective=True, strategy=strategy)

        args.rank = dist.get_rank()
        # obtain rank message of hybrid parallel
        hcg = fleet.get_hybrid_communicate_group()
        args.mp_rank = hcg.get_model_parallel_rank()
        args.dp_rank = hcg.get_data_parallel_rank()
        args.sharding_rank = hcg.get_sharding_parallel_rank()

        # sharding ranks also consume distinct data shards
        args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank
        args.data_world_size = world_size // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree)
    else:
        args.data_world_rank = 0
        args.data_world_size = 1
        args.mp_rank = 0
        args.rank = 0

    # seed control in hybrid parallel
    set_hybrid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank)
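For a sense of how `setdistenv` is consumed: it mutates the parsed arguments object in place and is called once at startup (the trainer passes its parsed arguments). A minimal single-process sketch, assuming a GPU build of Paddle; the `SimpleNamespace` stand-in and the flat import path are illustrative, not the repo's actual wiring:

```python
# minimal sketch of calling setdistenv (illustrative; assumes a GPU build of Paddle)
from types import SimpleNamespace

from dist_env import setdistenv  # hypothetical flat import; the package uses `from .dist_env import setdistenv`

args = SimpleNamespace(
    dp_degree=1,
    tensor_parallel_degree=1,
    sharding_parallel_degree=1,
    pipeline_parallel_degree=1,
    sep_parallel_degree=1,
    seed=0,
)
setdistenv(args)  # with world_size == 1 this only zeroes the ranks and seeds the RNGs
print(args.data_world_rank, args.data_world_size)  # -> 0 1
```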