Support diffusers SD3 speedup (#945)

This PR is done:
- [x] Support diffusers SD3 speedup with the nexfort backend.

Co-authored-by: strint <[email protected]>
1 parent 323897c, commit 6c156e6. Showing 4 changed files with 194 additions and 1 deletion.
@@ -0,0 +1,59 @@
# Run SD3 with nexfort backend (Beta Release)

## Environment setup
### Set up onediff
https://github.com/siliconflow/onediff?tab=readme-ov-file#installation

### Set up nexfort backend
https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort

### Set up diffusers

```
pip install git+https://github.com/huggingface/diffusers.git@main
```
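
SD3 support landed in diffusers only recently, so it is worth confirming that the installed build exposes the SD3 pipeline before going further. A quick sanity check (this check is an addition here, not part of the repo):

```
# Sanity check (not part of the repo): confirm the installed diffusers build has SD3 support.
import diffusers
from diffusers import StableDiffusion3Pipeline  # ImportError here means diffusers is too old

print(diffusers.__version__)
```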
### Set up SD3
Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/refs%2Fpr%2F26

HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md

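The example script below pins this revision when it loads the checkpoint; loading it directly looks roughly like the following sketch (half precision on CUDA, as in the script):

```
import torch
from diffusers import StableDiffusion3Pipeline

# Load the SD3 medium checkpoint from the diffusers-format revision linked above.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium",
    torch_dtype=torch.float16,
    revision="refs/pr/26",
)
pipe.to("cuda")
```
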
## Run

### Run 1024*1024 without compile (the original PyTorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
    --saved-image sd3.png
```

### Run 1024*1024 with compile

```
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
    --compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}' \
    --saved-image sd3_compile.png
```

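The `--compiler-config` JSON is parsed by the script and forwarded to onediffx's `compile_pipe` with the nexfort backend; the equivalent direct call is roughly:

```
from onediffx import compile_pipe

options = {
    "mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs",
    "memory_format": "channels_last",
}
# fuse_qkv_projections=True matches what the script passes.
pipe = compile_pipe(pipe, backend="nexfort", options=options, fuse_qkv_projections=True)
```
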
## Performance comparison

Testing on H800, with an image size of 1024*1024 and 28 inference steps.

|                 | Iteration speed     | E2E Inference Time | Max CUDA Memory Used |
| --------------- | ------------------- | ------------------ | -------------------- |
| Baseline        | 15.56 it/s          | 1.96 s             | 18.784 GiB           |
| Nexfort compile | 25.91 it/s (+66.5%) | 1.15 s (-41.3%)    | 18.324 GiB           |

Testing on A100-PCIE-40GB, with an image size of 1024*1024 and 28 inference steps.

|                 | Iteration speed    | E2E Inference Time | Max CUDA Memory Used |
| --------------- | ------------------ | ------------------ | -------------------- |
| Baseline        | 6.66 it/s          | 4.50 s             | 18.762 GiB           |
| Nexfort compile | 9.39 it/s (+40.9%) | 3.15 s (-30.0%)    | 17.939 GiB           |

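The E2E time and memory columns correspond to what the script reports: wall-clock time around the pipeline call after a warmup run, and the peak value from the CUDA allocator. A minimal sketch of that measurement (the `reset_peak_memory_stats` and `synchronize` calls are extras here, not in the script):

```
import time

import torch

def timed_generate(pipe, **gen_args):
    # Assumes a warmup call has already been made, so compilation time is excluded.
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    images = pipe(**gen_args).images
    torch.cuda.synchronize()
    elapsed = time.time() - start
    peak_gib = torch.cuda.max_memory_allocated() / (1024**3)
    return images, elapsed, peak_gib
```
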
## Quality
With nexfort as the onediff compilation backend, the acceleration is lossless: the compiled pipeline generates the same images as the baseline.

<p align="center">
<img src="../../../imgs/nexfort_sd3_demo.png">
</p>
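
Since the script fixes the random seed, a simple way to spot-check this is to diff a baseline image against a compiled-run image pixel by pixel (the file names follow the run commands above; the comparison itself is not part of the repo):

```
# Hypothetical spot check, not part of this PR: compare baseline vs. compiled outputs.
import numpy as np
from PIL import Image

baseline = np.asarray(Image.open("sd3.png"), dtype=np.int16)
compiled = np.asarray(Image.open("sd3_compile.png"), dtype=np.int16)
print("max abs pixel difference:", np.abs(baseline - compiled).max())
```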
onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py (134 additions, 0 deletions)
@@ -0,0 +1,134 @@
import argparse
import json
import time

import torch
from diffusers import StableDiffusion3Pipeline
from onediffx import compile_pipe, quantize_pipe


def parse_args():
    parser = argparse.ArgumentParser(
        description="Use onediff (nexfort) to accelerate image generation with Stable Diffusion 3."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="stabilityai/stable-diffusion-3-medium",
        help="Model path or identifier.",
    )
    parser.add_argument(
        "--compiler-config", type=str, help="JSON string for compiler config."
    )
    parser.add_argument(
        "--quantize-config", type=str, help="JSON string for quantization config."
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="photo of a dog and a cat both standing on a red box, with a blue ball in the middle with a parrot standing on top of the ball. The box has the text 'onediff'",
        help="Prompt for the image generation.",
    )
    parser.add_argument(
        "--height", type=int, default=1024, help="Height of the generated image."
    )
    parser.add_argument(
        "--width", type=int, default=1024, help="Width of the generated image."
    )
    parser.add_argument(
        "--num-inference-steps",
        type=int,
        default=28,
        help="Number of inference steps.",
    )
    parser.add_argument(
        "--saved-image",
        type=str,
        default="./sd3.png",
        help="Path to save the generated image.",
    )
    parser.add_argument(
        "--seed", type=int, default=1, help="Seed for random number generation."
    )
    return parser.parse_args()


args = parse_args()

device = torch.device("cuda")


class SD3Generator:
    def __init__(self, model, compiler_config=None, quantize_config=None):
        # Load the SD3 pipeline in half precision from the diffusers-format revision.
        self.pipe = StableDiffusion3Pipeline.from_pretrained(
            model, torch_dtype=torch.float16, revision="refs/pr/26"
        )
        self.pipe.to(device)

        if compiler_config:
            print("compile...")
            self.pipe = self.compile_pipe(self.pipe, compiler_config)

        if quantize_config:
            print("quant...")
            self.pipe = self.quantize_pipe(self.pipe, quantize_config)

    def warmup(self, gen_args, warmup_iterations=1):
        warmup_args = gen_args.copy()

        warmup_args["generator"] = torch.Generator(device=device).manual_seed(0)

        print("Starting warmup...")
        for _ in range(warmup_iterations):
            self.pipe(**warmup_args)
        print("Warmup complete.")

    def generate(self, gen_args):
        # Warm up first so compilation overhead is excluded from the timed run.
        self.warmup(gen_args)

        gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed)

        # Run the model and time only the generation call.
        start_time = time.time()
        images = self.pipe(**gen_args).images
        end_time = time.time()

        images[0].save(args.saved_image)

        return images[0], end_time - start_time

    def compile_pipe(self, pipe, compiler_config):
        options = compiler_config
        pipe = compile_pipe(
            pipe, backend="nexfort", options=options, fuse_qkv_projections=True
        )
        return pipe

    def quantize_pipe(self, pipe, quantize_config):
        pipe = quantize_pipe(pipe, ignores=[], **quantize_config)
        return pipe


def main():
    # The configs are documented as JSON strings, so parse them with json rather than eval.
    compiler_config = json.loads(args.compiler_config) if args.compiler_config else None
    quantize_config = json.loads(args.quantize_config) if args.quantize_config else None

    sd3 = SD3Generator(args.model, compiler_config, quantize_config)

    gen_args = {
        "prompt": args.prompt,
        "num_inference_steps": args.num_inference_steps,
        "height": args.height,
        "width": args.width,
    }

    image, inference_time = sd3.generate(gen_args)
    print(
        f"Generated image saved to {args.saved_image} in {inference_time:.2f} seconds."
    )
    cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Max used CUDA memory: {cuda_mem_after_used:.3f} GiB")


if __name__ == "__main__":
    main()