Support diffusers SD3 speedup (#945)

This PR is done:
- [x] Support diffusers SD3 speedup with the nexfort backend.

Co-authored-by: strint <[email protected]>
1 parent 323897c, commit 6c156e6. Showing 4 changed files with 194 additions and 1 deletion.
@@ -0,0 +1,59 @@
# Run SD3 with nexfort backend (Beta Release)

## Environment setup
### Set up onediff
https://github.com/siliconflow/onediff?tab=readme-ov-file#installation

### Set up nexfort backend
https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort

### Set up diffusers

```
pip install git+https://github.com/huggingface/diffusers.git@main
```
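
SD3 support landed in diffusers only recently, so it is worth confirming that the installed build exposes the SD3 pipeline before going further. A quick sanity check (this check is an addition here, not part of the repo):

```
# Sanity check (not part of the repo): confirm the installed diffusers build has SD3 support.
import diffusers
from diffusers import StableDiffusion3Pipeline  # ImportError here means diffusers is too old

print(diffusers.__version__)
```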
### Set up SD3
Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/refs%2Fpr%2F26

HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md

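The example script below pins this revision when it loads the checkpoint; loading it directly looks roughly like the following sketch (half precision on CUDA, as in the script):

```
import torch
from diffusers import StableDiffusion3Pipeline

# Load the SD3 medium checkpoint from the diffusers-format revision linked above.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium",
    torch_dtype=torch.float16,
    revision="refs/pr/26",
)
pipe.to("cuda")
```
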
## Run

### Run 1024*1024 without compile (the original PyTorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
    --saved-image sd3.png
```

### Run 1024*1024 with compile

```
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
    --compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}' \
    --saved-image sd3_compile.png
```

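The `--compiler-config` JSON is parsed by the script and forwarded to onediffx's `compile_pipe` with the nexfort backend; the equivalent direct call is roughly:

```
from onediffx import compile_pipe

options = {
    "mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs",
    "memory_format": "channels_last",
}
# fuse_qkv_projections=True matches what the script passes.
pipe = compile_pipe(pipe, backend="nexfort", options=options, fuse_qkv_projections=True)
```
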
## Performance comparison

Testing on H800, with an image size of 1024*1024 and 28 inference steps.

|                 | Iteration speed     | E2E Inference Time | Max CUDA Memory Used |
| --------------- | ------------------- | ------------------ | -------------------- |
| Baseline        | 15.56 it/s          | 1.96 s             | 18.784 GiB           |
| Nexfort compile | 25.91 it/s (+66.5%) | 1.15 s (-41.3%)    | 18.324 GiB           |

Testing on A100-PCIE-40GB, with an image size of 1024*1024 and 28 inference steps.

|                 | Iteration speed    | E2E Inference Time | Max CUDA Memory Used |
| --------------- | ------------------ | ------------------ | -------------------- |
| Baseline        | 6.66 it/s          | 4.50 s             | 18.762 GiB           |
| Nexfort compile | 9.39 it/s (+40.9%) | 3.15 s (-30.0%)    | 17.939 GiB           |

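The E2E time and memory columns correspond to what the script reports: wall-clock time around the pipeline call after a warmup run, and the peak value from the CUDA allocator. A minimal sketch of that measurement (the `reset_peak_memory_stats` and `synchronize` calls are extras here, not in the script):

```
import time

import torch

def timed_generate(pipe, **gen_args):
    # Assumes a warmup call has already been made, so compilation time is excluded.
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    images = pipe(**gen_args).images
    torch.cuda.synchronize()
    elapsed = time.time() - start
    peak_gib = torch.cuda.max_memory_allocated() / (1024**3)
    return images, elapsed, peak_gib
```
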
## Quality
With nexfort as the onediff compilation backend, the acceleration is lossless: the compiled pipeline generates the same images as the baseline.

<p align="center">
<img src="../../../imgs/nexfort_sd3_demo.png">
</p>
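
Since the script fixes the random seed, a simple way to spot-check this is to diff a baseline image against a compiled-run image pixel by pixel (the file names follow the run commands above; the comparison itself is not part of the repo):

```
# Hypothetical spot check, not part of this PR: compare baseline vs. compiled outputs.
import numpy as np
from PIL import Image

baseline = np.asarray(Image.open("sd3.png"), dtype=np.int16)
compiled = np.asarray(Image.open("sd3_compile.png"), dtype=np.int16)
print("max abs pixel difference:", np.abs(baseline - compiled).max())
```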
onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py (134 additions, 0 deletions)
@@ -0,0 +1,134 @@
import argparse
import json
import time

import torch
from diffusers import StableDiffusion3Pipeline
from onediffx import compile_pipe, quantize_pipe


def parse_args():
    parser = argparse.ArgumentParser(
        description="Use onediff (nexfort) to accelerate image generation with Stable Diffusion 3."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="stabilityai/stable-diffusion-3-medium",
        help="Model path or identifier.",
    )
    parser.add_argument(
        "--compiler-config", type=str, help="JSON string for compiler config."
    )
    parser.add_argument(
        "--quantize-config", type=str, help="JSON string for quantization config."
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="photo of a dog and a cat both standing on a red box, with a blue ball in the middle with a parrot standing on top of the ball. The box has the text 'onediff'",
        help="Prompt for the image generation.",
    )
    parser.add_argument(
        "--height", type=int, default=1024, help="Height of the generated image."
    )
    parser.add_argument(
        "--width", type=int, default=1024, help="Width of the generated image."
    )
    parser.add_argument(
        "--num-inference-steps",
        type=int,
        default=28,
        help="Number of inference steps.",
    )
    parser.add_argument(
        "--saved-image",
        type=str,
        default="./sd3.png",
        help="Path to save the generated image.",
    )
    parser.add_argument(
        "--seed", type=int, default=1, help="Seed for random number generation."
    )
    return parser.parse_args()


args = parse_args()

device = torch.device("cuda")


class SD3Generator:
    def __init__(self, model, compiler_config=None, quantize_config=None):
        # Load the SD3 pipeline in half precision from the diffusers-format revision.
        self.pipe = StableDiffusion3Pipeline.from_pretrained(
            model, torch_dtype=torch.float16, revision="refs/pr/26"
        )
        self.pipe.to(device)

        if compiler_config:
            print("compile...")
            self.pipe = self.compile_pipe(self.pipe, compiler_config)

        if quantize_config:
            print("quant...")
            self.pipe = self.quantize_pipe(self.pipe, quantize_config)

    def warmup(self, gen_args, warmup_iterations=1):
        warmup_args = gen_args.copy()

        warmup_args["generator"] = torch.Generator(device=device).manual_seed(0)

        print("Starting warmup...")
        for _ in range(warmup_iterations):
            self.pipe(**warmup_args)
        print("Warmup complete.")

    def generate(self, gen_args):
        # Warm up first so compilation overhead is excluded from the timed run.
        self.warmup(gen_args)

        gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed)

        # Run the model and time only the generation call.
        start_time = time.time()
        images = self.pipe(**gen_args).images
        end_time = time.time()

        images[0].save(args.saved_image)

        return images[0], end_time - start_time

    def compile_pipe(self, pipe, compiler_config):
        options = compiler_config
        pipe = compile_pipe(
            pipe, backend="nexfort", options=options, fuse_qkv_projections=True
        )
        return pipe

    def quantize_pipe(self, pipe, quantize_config):
        pipe = quantize_pipe(pipe, ignores=[], **quantize_config)
        return pipe


def main():
    # The configs are documented as JSON strings, so parse them with json rather than eval.
    compiler_config = json.loads(args.compiler_config) if args.compiler_config else None
    quantize_config = json.loads(args.quantize_config) if args.quantize_config else None

    sd3 = SD3Generator(args.model, compiler_config, quantize_config)

    gen_args = {
        "prompt": args.prompt,
        "num_inference_steps": args.num_inference_steps,
        "height": args.height,
        "width": args.width,
    }

    image, inference_time = sd3.generate(gen_args)
    print(
        f"Generated image saved to {args.saved_image} in {inference_time:.2f} seconds."
    )
    cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Max used CUDA memory: {cuda_mem_after_used:.3f} GiB")


if __name__ == "__main__":
    main()