inference is slow #1

Redtides0 · 2024-05-01T09:03:12Z

Nice work!
when i run the script "test_cnc.py" provided in src/test, the inference speed is fast.
but when i make a new pipeline according to "test_cnc.py", the inference speed become super slow, when i check the CPU， GPU utilization， i found that the CPU utilization is very high and GPU utilization is low.

after debugging, all models and tensors are on "cuda", but the GPU utilization is not right

here is my new pipeline code, could you give me some advice?

import sys
if './' not in sys.path:
	sys.path.append('./')
import random
import torch
import cv2
import einops
import gradio as gr
import numpy as np

from torchvision.transforms import transforms
from pytorch_lightning import seed_everything
from models.util import create_model, load_state_dict
from models.ddim_hacked import DDIMSampler
from utils.utils import resize_image, HWC3, disable_verbosity, add_gridlines
from utils.primitives import GetDepthMap, GetSODMask, GetCLIPImageEmbeddings
from prepare_data import fg_img_list, bg_img_list, prompts_list
import os
from PIL import Image

import warnings
warnings.filterwarnings("ignore")

disable_verbosity()
DEFAULT_NEGATIVE_PROMPT = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = create_model('./configs/cnc_v1.yaml').cpu()
model.load_state_dict(load_state_dict('./trained_weights/cnc_v1.ckpt'))
model = model.to(device)
print(device)
ddim_sampler = DDIMSampler(model)

get_depth = GetDepthMap(device=device)
get_mask = GetSODMask(device=device)
get_emb = GetCLIPImageEmbeddings(device=device)

to_pil = transforms.ToPILImage()


def process(fg_image,
            bg_image,
            prompt,
            fg_emb_image=None,
            bg_emb_image=None,
            depth_version="v2",
            fg_depth_control=1,
            bg_depth_control=1,
            fg_global_control=1,
            bg_global_control=1,
            hw=768,
            batch_size=1,
            neg_prompt=DEFAULT_NEGATIVE_PROMPT,
            ddim_steps=50,
            cfg_scale=7.5,
            seed=-1,
            ):
    '''
    inputs = [  fg_image,  (type="numpy")
                bg_image,  (type="numpy")
                fg_emb_image,  (type="numpy")
                bg_emb_image,  (type="numpy")
                depth_version,  (select from ["v1", "v2", "v3"])
                fg_depth_control, (minimum=0, maximum=2, value=1, step=0.1)
                bg_depth_control, (minimum=0, maximum=2, value=1, step=0.1)
                fg_global_control, (minimum=0, maximum=2, value=1, step=0.1)
                bg_global_control, (minimum=0, maximum=2, value=1, step=0.1)
                hw,                (minimum=512, maximum=1024, value=768, step=256)
                batch_size,        (minimum=1, maximum=12, value=1, step=1)
                prompt,            (type="str")
                neg_prompt,        (value=DEFAULT_NEGATIVE_PROMPT)
                ddim_steps,        (minimum=1, maximum=100, value=50, step=1)
                cfg_scale,         (minimum=1, maximum=20, value=7.5, step=0.5))
                seed,              (minimum=-1, maximum=65536, value=-1, step=1)
            ]  
    Output: [results, conditions]
    '''
    print("Running...")
    if seed == -1:
        seed = random.randint(np.iinfo(np.uint32).min, np.iinfo(np.uint32).max)

    seed_everything(seed)

    to_tensor = transforms.ToTensor()
    H, W, C = hw, hw, 3

    if fg_image is not None:
        fg_image = cv2.resize(fg_image, (W,H))
        mask_anchor = fg_image
    else:
        fg_image = np.zeros((1, 3, hw, hw))
        mask_anchor = None
        
    if bg_image is not None:
        bg_image = cv2.resize(bg_image, (W,H))
    else:
        bg_image = np.zeros((1, 3, hw, hw))

    if fg_emb_image is not None:
        fg_emb_image = cv2.resize(fg_emb_image, (W,H))
        fg_emb = get_emb(fg_emb_image)
        if mask_anchor is None:
            mask_anchor = fg_emb_image
    else:
        fg_emb_image = np.zeros((1, 3, hw, hw))
        fg_emb = torch.zeros(1,768).to(device)
        if mask_anchor is None:
            mask_anchor = fg_emb_image
        
    if bg_emb_image is not None:
        bg_emb_image = cv2.resize(bg_emb_image, (W,H))
        bg_emb = get_emb(bg_emb_image)
    else:
        bg_emb_image = np.zeros((1, 3, hw, hw))
        bg_emb = torch.zeros(1,768).to(device)

    mask_anchor_tensor = to_tensor(mask_anchor).unsqueeze(0).to(device)
    fg_mask = get_mask(mask_anchor_tensor, resize=(hw, hw))
    fg_mask_3ch = fg_mask.expand(-1, 3, -1, -1)

    if fg_image.sum().item() == 0:
        fg_depth_v1 = torch.zeros(1, 3, hw, hw).to(device)
        fg_depth_v2 = torch.zeros(1, 3, hw, hw).to(device)
        fg_depth_v3 = torch.zeros(1, 3, hw, hw).to(device)
    else:
        fg_image_tensor = to_tensor(fg_image).unsqueeze(0).to(device)
        fg_depth_v1 = get_depth(fg_image_tensor)
        fg_depth_v2 = get_depth(fg_image_tensor * fg_mask_3ch)
        fg_depth_v3 = fg_depth_v1 * fg_mask_3ch

    if bg_image.sum().item() == 0:
        bg_depth = torch.zeros(1, 3, hw, hw).to(device)
    else:
        bg_image_tensor = to_tensor(bg_image).unsqueeze(0).to(device)
        bg_depth = get_depth(bg_image_tensor)

    if depth_version=="v1":
        fg_depth = fg_depth_v1.clone()
    elif depth_version=="v2":
        fg_depth = fg_depth_v2.clone()
    elif depth_version=="v3":
        fg_depth = fg_depth_v3.clone()

    fg_depth = torch.clamp(fg_depth * fg_depth_control, min=0.0, max=1.0)
    bg_depth = torch.clamp(bg_depth * bg_depth_control, min=0.0, max=1.0)
    
    full_prompt = prompt + ' ,masterpiece, ultra detailed, high resolution'

    c_cross = model.get_learned_conditioning(full_prompt)
    uc_cross = model.get_learned_conditioning(neg_prompt)
    cond = {"bg_depth": [bg_depth.repeat(batch_size, 1, 1, 1)],
            "fg_depth": [fg_depth.repeat(batch_size, 1, 1, 1)],
            "bg_emb": [bg_emb.repeat(batch_size, 1)],
            "fg_emb": [fg_emb.repeat(batch_size, 1)],
            "c_crossattn": [c_cross.repeat(batch_size, 1, 1)],
            "mask": [fg_mask.repeat(batch_size, 1, 1, 1)]}

    un_cond = {"bg_depth": [bg_depth.repeat(batch_size, 1, 1, 1)],
            "fg_depth": [fg_depth.repeat(batch_size, 1, 1, 1)],
            "bg_emb": [bg_emb.repeat(batch_size, 1)],
            "fg_emb": [fg_emb.repeat(batch_size, 1)],
            "c_crossattn": [uc_cross.repeat(batch_size, 1, 1)],
            "mask": [fg_mask.repeat(batch_size, 1, 1, 1)]}
    
    shape = (4, H // 8, W // 8)

    samples, _ = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False,
                                                 unconditional_guidance_scale=cfg_scale,
                                                 unconditional_conditioning=un_cond,
                                                 global_strength=[fg_global_control, bg_global_control])


    x_samples = model.decode_first_stage(samples)
    x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
    results = [x_samples[i] for i in range(batch_size)]

    fg_depth_array = (einops.rearrange(fg_depth, 'b c h w -> b h w c') * 255.0).cpu().numpy().clip(0, 255).astype(np.uint8)[0]
    bg_depth_array = (einops.rearrange(bg_depth, 'b c h w -> b h w c') * 255.0).cpu().numpy().clip(0, 255).astype(np.uint8)[0]
    fg_mask_array = (einops.rearrange(fg_mask_3ch, 'b c h w -> b h w c') * 255.0).cpu().numpy().clip(0, 255).astype(np.uint8)[0]

    if fg_emb_image.sum().item() != 0:
        fg_emb_array = add_gridlines(fg_emb_image)
        bg_emb_array = add_gridlines(bg_emb_image)
    else:
        fg_emb_array = np.zeros((768,768,3))
        bg_emb_array = np.zeros((768,768,3))

    grid = np.concatenate(results,axis=1)
    
    conditions = [fg_depth_array, bg_depth_array, fg_emb_array, bg_emb_array, fg_mask_array, grid]

    return [results, conditions]

def main(batch_size):
    for index, prompt in enumerate(prompts_list):
        [results, conditions] = process(fg_image=fg_img_list[index], bg_image=bg_img_list[index], prompt=prompt, batch_size=batch_size)
        for j, result in enumerate(results):
            img = Image.fromarray(result.astype('uint8')).convert('RGB')
            file_name = f"{prompt}_00000{j}.png"
            img.save(fp=file_name)

if __name__ == "__main__":
    main(batch_size=1)

The text was updated successfully, but these errors were encountered:

Redtides0 · 2024-05-01T09:49:14Z

the problem may be from "compose-and-conquer/models/ddim_hacked.py" " p_sample_ddim" function.

model_t = self.model.apply_model(x, t, c, global_strength)
model_uncond = self.model.apply_model(x, t, unconditional_conditioning, global_strength)

the code above takes more time ( 6~7 seconds) than usual

tomtom1103 · 2024-05-01T12:11:34Z

@Redtides0 hi, thanks for taking interest in our work!

it looks like you've made a custom script to load your own triplets of fg, bg images and prompts without the need for gradio, judging from

from prepare_data import fg_img_list, bg_img_list, prompts_list

Since I don't have your custom data, I tried to recreate the slowness by switching the main function from your script to

def main(batch_size):
    ITER = 5
    np.random.seed(42)
    prompts_list = ["test" for _ in range(ITER)]
    fg_img_list = [np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8) for _ in range(ITER)]
    bg_img_list = [np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8) for _ in range(ITER)]

    for index, prompt in enumerate(prompts_list):
        [results, conditions] = process(fg_image=fg_img_list[index], bg_image=bg_img_list[index], prompt=prompt, batch_size=batch_size, hw=512, seed=42)
        for j, result in enumerate(results):
            img = Image.fromarray(result.astype('uint8')).convert('RGB')
            file_name = f"{prompt}_00000{index}.png"
            img.save(fp=file_name)

if __name__ == "__main__":
    main(batch_size=1)

basically creating random fg, bg images. However, I wasn't able to recreate the bottleneck.

I'm not sure what the problem might be just based off of your code since nothing seems out of the ordinary, but you could try setting the two use_checkpoint flags in configs/cnc_v1.yaml to False. These are the flags for gradient checkpointing (used during training), which in theory shouldn't affect inference, but when I ran some torch profiling, I noticed some overhead:

Where having use_checkpoint=True was about 2 seconds slower, explicitly on the CPU.

If it's still slow, you could try
from torch.profiler import profile, ProfilerActivity

and replace your script with

    shape = (4, H // 8, W // 8)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], 
             record_shapes=True, 
             profile_memory=True,
             with_stack=True) as prof:


        samples, _ = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False,
                                                    unconditional_guidance_scale=cfg_scale,
                                                    unconditional_conditioning=un_cond,
                                                    global_strength=[fg_global_control, bg_global_control])

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    x_samples = model.decode_first_stage(samples)

and post the results here, maybe that might give some information about what might be causing the bottleneck.

平安,
Tom

Redtides0 · 2024-05-01T13:20:24Z

@tomtom1103 Thanks for ur advice :)
Following the first advice, i set use_check_point = False in configs/cnc_v1.yaml, but it does not help :(

the top one is my pipeline under use_check_point = True ,
the bottom one is my pipeline under use_check_point = False (even worse)

Follwing the second advice, i get output like this (sorry for my narrow screen)

tomtom1103 · 2024-05-01T14:18:22Z

@Redtides0 those times for a single image is indeed VERY slow, and isn't normal. When running test_cnc.py (the gradio demo), how long does it take to generate a single image? is it only slow when you use your custom script without gradio?

If so, you could try adding the profiling script to test_cnc.py, and checking the results.
you might want to change the key to

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

from your profiler table and mine, it can be seen that aten::conv2d, aten::convolution, aten::_convolution is the bulk of the computation, but yours seem to be missing aten::cudnn_convolution, which means that cuDNN isn't being utilized in handling the attention computations. This might be the reason why aten::conv2d, aten::convolution, aten::_convolution takes 17 seconds each, since its solely being computed on the CPU. could you try adding the profiling script to the gradio demo, and check if cuDNN is being properly utilized?

if cuDNN doesn't work for both the gradio demo and your custom script, there might be a problem with how torch was compiled in your environment.
elif cuDNN does work for the gradio demo but not your custom script, I'll be honest and tell you I have no idea why. your custom data loading script looks fine to me.

Redtides0 · 2024-05-01T14:47:55Z

Thanks for your detailed explanation :)

When running test_cnc.py (the gradio demo), it takes 20 seconds to generate a single image when ddim_steps=50

For is it only slow when you use your custom script without gradio?
Yes, only when i run the custom script without gradio.

And aten::cudnn_convolution EXISTS when running test_cnc.py

tomtom1103 · 2024-05-02T02:41:42Z

@Redtides0 it seems that the problem can be narrowed down to the fact that cuDNN is being utilized for test_cnc.py, but not for your custom script (since even the self CUDA time is 4 times slower). This is very odd indeed, because your script doesn't seem out of the ordinary and I don't see any parts where any of the weights might not be moved to cuda. Unfortunately, it's very hard for me to give any specific tips since I don't know about the device and environment you're running the script on, but you could start with the fact that cuDNN isn't being utilized properly. Please give me any updates if you manage to figure this one out, I'm also very curious to know why this is 🤒

Redtides0 · 2024-05-02T05:44:15Z

@tomtom1103 my card is NVIDIA-A800-SXM4-80GB, and the environment is from environment.yaml, maybe notably, I can not install the environment through conda env create -f environment.yaml because of the network in China mainland, when i separately install the dependency, I am stuck in gradio, after I update the pip from 20.3 to 24.0, it works with a DEPRECATION.

DEPRECATION: pytorch-lightning 1.6.0 has a non-standard dependency specifier torch>=1.8.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at pypa/pip#12063

when i upgrade pytorch-lightning to 1.8.0 according to pytorch-lightning/issues/18563, the error occur:

File "./models/cnc.py", line 7, in
from ldm.models.diffusion.ddpm import LatentDiffusion
File "./ldm/models/diffusion/ddpm.py", line 20, in
from pytorch_lightning.utilities.distributed import rank_zero_only
ImportError: cannot import name 'rank_zero_only' from 'pytorch_lightning.utilities.distributed' (/opt/conda/envs/cnc/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py)

tomtom1103 · 2024-05-02T10:58:23Z

@Redtides0 I see, the problem seems to be with how pytorch-lightning handles cuDNN in your custom script. the most straightforward solution that I can think of is to create another conda env without gradio to run your custom script. If you needed to run the gradio demo, you could simply switch over to the original environment. I hope this solves your issue!

Redtides0 · 2024-05-02T12:45:09Z

Unfortunately, it didn't work :(
could you provide the conda environment through conda pack ? I plan to prepare the environment offline

Redtides0 · 2024-05-02T14:40:11Z

@tomtom1103 Good news !
I hacked process() in test_cnc.py, as long as this function is trigger by run_button.click, it works well.

tomtom1103 · 2024-05-03T04:01:27Z

@Redtides0 awesome! that's a very novel way to hack around the gradio dependency. hope you have fun with CnC!

Cheers :)

tomtom1103 closed this as completed May 3, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

inference is slow #1

inference is slow #1

Redtides0 commented May 1, 2024

Redtides0 commented May 1, 2024

tomtom1103 commented May 1, 2024 •

edited

Loading

Redtides0 commented May 1, 2024 •

edited

Loading

tomtom1103 commented May 1, 2024

Redtides0 commented May 1, 2024

tomtom1103 commented May 2, 2024

Redtides0 commented May 2, 2024 •

edited

Loading

tomtom1103 commented May 2, 2024

Redtides0 commented May 2, 2024

Redtides0 commented May 2, 2024 •

edited

Loading

tomtom1103 commented May 3, 2024

inference is slow #1

inference is slow #1

Comments

Redtides0 commented May 1, 2024

Redtides0 commented May 1, 2024

tomtom1103 commented May 1, 2024 • edited Loading

Redtides0 commented May 1, 2024 • edited Loading

tomtom1103 commented May 1, 2024

Redtides0 commented May 1, 2024

tomtom1103 commented May 2, 2024

Redtides0 commented May 2, 2024 • edited Loading

tomtom1103 commented May 2, 2024

Redtides0 commented May 2, 2024

Redtides0 commented May 2, 2024 • edited Loading

tomtom1103 commented May 3, 2024

tomtom1103 commented May 1, 2024 •

edited

Loading

Redtides0 commented May 1, 2024 •

edited

Loading

Redtides0 commented May 2, 2024 •

edited

Loading

Redtides0 commented May 2, 2024 •

edited

Loading