diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index e38749e2ef6..1d10fa5a029 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -147,6 +147,13 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
+    "sdxl_ipex_sq":{
+      "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
     "resnet18_mixed_precision": {
       "model_src_dir": "cv/mixed_precision",
       "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md
new file mode 100644
index 00000000000..6b37038d0dc
--- /dev/null
+++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md
@@ -0,0 +1,83 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running the [Stable Diffusion XL model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with SmoothQuant to accelerate inference while maintaining the quality of the output images.
+
+# Prerequisite
+
+## Environment
+Python 3.9 or a higher version is recommended.
+
+```shell
+pip install -r requirements.txt
+```
+**Note**: IPEX and torch must be nightly (2.4) builds for compatibility. Please refer to [installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=main&os=linux%2fwsl2&package=source).
+
+# Run
+
+To quantize the model:
+```bash
+python sdxl_smooth_quant.py --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --quantize --alpha 0.44 --output_dir "./saved_results"
+```
+or
+```bash
+sh run_quant.sh --alpha=0.44
+```
+To load a quantized model:
+```bash
+python sdxl_smooth_quant.py --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --load --int8
+```
+or
+```bash
+sh run_quant.sh --int8=true
+```
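For orientation, the `--quantize` path in `sdxl_smooth_quant.py` applies the Neural Compressor 3.x SmoothQuant flow to the UNet only. Below is a condensed sketch of that flow, not a standalone script: `pipeline`, `cali_prompts`, `init_latent`, and `do_calibration` are objects defined in `sdxl_smooth_quant.py`, and the static `example_inputs` mirror the shapes the script uses for a 1024x1024 SDXL UNet with classifier-free guidance (batch of 2).

```python
import torch
from neural_compressor.torch.quantization import SmoothQuantConfig, prepare, convert

# Static inputs matching the SDXL UNet forward signature (CFG batch of 2).
example_inputs = {
    "sample": torch.randn((2, 4, 128, 128), dtype=torch.float32),
    "timestep": torch.tensor(951.0),
    "encoder_hidden_states": torch.randn((2, 77, 2048), dtype=torch.float32),
    "added_cond_kwargs": {
        "text_embeds": torch.randn((2, 1280), dtype=torch.float32),
        "time_ids": torch.tensor([[1024., 1024., 0., 0., 1024., 1024.],
                                  [1024., 1024., 0., 0., 1024., 1024.]], dtype=torch.float32),
    },
}

quant_config = SmoothQuantConfig(alpha=0.44, excluded_precisions=["bf16"])
unet = prepare(model=pipeline.unet, quant_config=quant_config, example_inputs=example_inputs)

def forward_loop(model):
    # Calibration: run the pipeline on a few prompts so observers see real activation ranges.
    do_calibration(pipeline=pipeline, calibration_prompts=cali_prompts,
                   calib_size=10, n_steps=20, latent=init_latent)

forward_loop(unet)
q_unet = convert(unet)
q_unet.save("./saved_results")  # writes qconfig.json and quantized_model.pt
```

`run_quant.sh --alpha=0.44` drives this same path; `--load --int8` (or `run_quant.sh --int8=true`) instead reloads the saved model with `neural_compressor.torch.quantization.load` and swaps it into the pipeline.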
+
+# Results
+## Image Generated
+
+With the caption `"A brown and white dog runs on some brown grass near a Frisbee that is just sailing above the ground."`, the outputs of the FP32 model and the INT8 model are shown on the left and right, respectively.
+
+| FP32 | INT8 |
+|------|------|
+| ![FP32 output](images/fp32.jpg) | ![INT8 output](images/int8.jpg) |
+
+## CLIP evaluation
+We also evaluated CLIP scores on 5000 samples from the COCO2014 validation dataset for the FP32 and INT8 models. The CLIP results are listed below.
+
+| Precision            | FP32  | INT8  |
+|----------------------|-------|-------|
+| CLIP on COCO2014 val | 32.05 | 31.77 |
+
+We use the [mlperf_sd_inference](https://github.com/ahmadki/mlperf_sd_inference) repo to evaluate CLIP scores. To support evaluation of the quantized model,
+we made some modifications to its script (`main.py`). Please use it as follows:
+```bash
+git clone https://github.com/ahmadki/mlperf_sd_inference.git
+cd mlperf_sd_inference
+mv ../main.py ./
+```
+After setting up the environment as instructed in the repo, you can run the modified `main.py` script to generate images.
+`--quantized-unet` is the directory the quantized model was saved to (it should contain `qconfig.json` and `quantized_model.pt`), and `--iters 200` can be raised to 5000 for the full 5k dataset:
+```bash
+python main.py \
+    --model-id stabilityai/stable-diffusion-xl-base-1.0 \
+    --quantized-unet ./saved_results \
+    --precision fp32 \
+    --guidance 8.0 \
+    --steps 20 \
+    --iters 200 \
+    --latent-path latents.pt \
+    --base-output-dir ./output
+```
+Then you can compute the CLIP score on the images generated by the quantized model (`--image-folder` points at the folder with the generated images):
+```bash
+# flatten the per-run output directory
+mv ./output/stabilityai--stable-diffusion-xl-base-1.0__euler__20__8.0__fp32/* ./output/
+rm -rf ./output/stabilityai--stable-diffusion-xl-base-1.0__euler__20__8.0__fp32/
+
+python clip/clip_score.py \
+    --tsv-file captions_5k.tsv \
+    --image-folder ./output \
+    --device "cpu"
+```
+Or you can use the bash script, which runs all of the steps above:
+```bash
+sh run_benchmark.sh --mode=accuracy --int8=true
+```
diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/fp32.jpg b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/fp32.jpg
new file mode 100644
index 00000000000..387eed9a802
Binary files /dev/null and b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/fp32.jpg differ
diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/int8.jpg b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/int8.jpg
new file mode 100644
index 00000000000..9a6d146894e
Binary files /dev/null and b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/images/int8.jpg differ
diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/latents.pt b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/latents.pt
new file mode 100644
index 00000000000..208dbc48a1c
Binary files /dev/null and b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/latents.pt differ
diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py
new file mode 100644
index 00000000000..1f5b72fd0f0
--- /dev/null
+++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py
@@ -0,0 +1,484 @@
+import os
+import logging
+import tempfile
+import shutil
+import argparse
+import pandas as pd
+import time
+import torch
+import intel_extension_for_pytorch as ipex
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from diffusers import (
+    DDPMScheduler,
+    DDIMScheduler,
+
EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + StableDiffusionXLPipeline, + StableDiffusionXLImg2ImgPipeline, +) +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import ( + deprecate, retrieve_timesteps, rescale_noise_cfg, + PipelineImageInput, StableDiffusionXLPipelineOutput +) + + +class StableDiffusionXLPipelineSQ(StableDiffusionXLPipeline): + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = 'cpu' + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + + # 8. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9. 
Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + )['sample'] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = 
self.image_processor.postprocess(image.detach(), output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) + + +parser = argparse.ArgumentParser() +parser.add_argument('--model-id', default="stabilityai/stable-diffusion-xl-base-1.0", type=str) +parser.add_argument('--precision', default='fp32', type=str) +parser.add_argument('--base-output-dir', default="./output", type=str) +parser.add_argument('--quantized-unet', default="./saved_results", type=str) +parser.add_argument("--int8", action="store_true", help="Load quantized model.") +parser.add_argument("--load", action="store_true") +parser.add_argument('--iters', default=5000, type=int, help="Num of image generated.") +parser.add_argument('--output-dir-name', default=None, type=str) +parser.add_argument('--output-dir-name-postfix', default=None, type=str) +parser.add_argument('--captions-fname', default="captions_5k.tsv", type=str) +parser.add_argument('--guidance', default=8.0, type=float) +parser.add_argument('--scheduler', default="euler", type=str) +parser.add_argument('--steps', default=20, type=int) +parser.add_argument('--negative-prompt', default="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", type=str) +parser.add_argument('--latent-path', default="latents.pt", type=str) +parser.add_argument('--generator-seed', default=None, type=int) +parser.add_argument("--refiner", dest='refiner', action="store_true", + help="Whether to add a refiner to the SDXL pipeline." + "Applicable only with --model-id=xl") +parser.add_argument("--no-refiner", dest='refiner', action="store_false", + help="Whether to add a refiner to the SDXL pipeline." + "Applicable only with --model-id=xl") + +args = parser.parse_args() + +# Init the logger +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' +) + +if args.latent_path and args.generator_seed: + raise ValueError( + "Cannot specify both --latent-path and --generator-seed" + ) + +if args.precision == "fp16": + dtype = torch.float16 +elif args.precision == "bf16": + dtype = torch.bfloat16 +else: + dtype = torch.float32 + +# Initialize defaults +device = torch.device('cpu') +world_size = 1 +rank = 0 + +# load frozen latent +latent_noise = None +if args.latent_path: + logging.info(f"[{rank}] loading latent from: {args.latent_path}") + latent_noise = torch.load(args.latent_path).to(dtype) + +logging.info(f"[{rank}] args: {args}") +logging.info(f"[{rank}] world_size: {world_size}") +logging.info(f"[{rank}] device: {device}") + +logging.info(f"[{rank}] using captions from: {args.captions_fname}") +df = pd.read_csv(args.captions_fname, sep='\t') +logging.info(f"[{rank}] {len(df)} captions loaded") + +# split captions among ranks +df = df[rank::world_size] +logging.info(f"[{rank}] {len(df)} captions assigned") + +# Build the pipeline +schedulers = { + "ddpm": DDPMScheduler.from_pretrained(args.model_id, subfolder="scheduler"), + "ddim": DDIMScheduler.from_pretrained(args.model_id, subfolder="scheduler"), + "euler_anc": EulerAncestralDiscreteScheduler.from_pretrained(args.model_id, subfolder="scheduler"), + "euler": EulerDiscreteScheduler.from_pretrained(args.model_id, subfolder="scheduler"), +} +pipe = StableDiffusionXLPipelineSQ.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=dtype, + use_safetensors=True, +) +pipe.scheduler = 
EulerDiscreteScheduler.from_config(pipe.scheduler.config) + +if args.refiner: + refiner_pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(args.model_id, + scheduler=schedulers[args.scheduler], + safety_checker=None, + add_watermarker=False, + variant="fp16" if args.precision == 'fp16' else None, + torch_dtype=dtype) + +if args.int8 and args.load: + from neural_compressor.torch.quantization import load + example_inputs = {"sample": torch.randn((2, 4, 128, 128), dtype=dtype), + "timestep": torch.tensor(951.0), + "encoder_hidden_states": torch.randn((2, 77, 2048), dtype=dtype), + "added_cond_kwargs": {'text_embeds':torch.randn((2, 1280), dtype=dtype), + 'time_ids': torch.tensor([[1024., 1024., 0., 0., 1024., 1024.], + [1024., 1024., 0., 0., 1024., 1024.]], dtype=dtype)},} + q_unet = load(args.quantized_unet) + for _ in range(2): + q_unet(**example_inputs) + print("Loaded Quantized Model") + setattr(q_unet, "config", pipe.unet.config) + pipe.unet = q_unet + +pipe.set_progress_bar_config(disable=True) +logging.info(f"[{rank}] Pipeline initialized: {pipe}") + +if args.refiner: + refiner_pipe = refiner_pipe.to(device) + refiner_pipe.set_progress_bar_config(disable=True) + logging.info(f"[{rank}] Refiner pipeline initialized: {refiner_pipe}") + +# Output directory +output_dir = args.output_dir_name or f"{args.model_id.replace('/','--')}__{args.scheduler}__{args.steps}__{args.guidance}__{args.precision}" +if args.output_dir_name_postfix is not None: + output_dir = f"{output_dir}_{args.output_dir_name_postfix}" + +output_dir = os.path.join(args.base_output_dir, output_dir) + +# Ensure the output directory exists +if not os.path.exists(output_dir): + os.makedirs(output_dir) + +# Create a temporary directory to atomically move the images +tmp_dir = tempfile.mkdtemp() + +# Generate the images +for index, row in df.iterrows(): + image_id = row['image_id'] + caption_id = row['id'] + caption_text = row['caption'] + + destination_path = os.path.join(output_dir, f"{caption_id}.png") + + if index >= args.iters: + break + + # Check if the image already exists in the output directory + if not os.path.exists(destination_path): + # Generate the image + print(index, caption_text) + tic = time.time() + image = pipe(prompt=caption_text, + negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", + guidance_scale=8.0, + generator=torch.Generator(device=device).manual_seed(args.generator_seed) if args.generator_seed else None, + latents=latent_noise, + num_inference_steps=20).images[0] + toc = time.time() + print("Time taken : ",toc-tic) + + if args.refiner: + image = refiner_pipe(caption_text, + image=image).images[0] + + # Save the image + image_path_tmp = os.path.join(tmp_dir, f"{caption_id}.png") + image.save(image_path_tmp) + shutil.move(image_path_tmp, destination_path) + + logging.info(f"[{rank}] Saved image {caption_id}: {caption_text}") diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/requirements.txt new file mode 100644 index 00000000000..f1fe1f7e20f --- /dev/null +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/requirements.txt @@ -0,0 +1,8 @@ +diffusers +accelerate +torch +transformers +tensorboard +intel_extension_for_pytorch +tqdm +open-clip-torch \ No newline at end of file diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh 
b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh new file mode 100644 index 00000000000..54046faebb1 --- /dev/null +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh @@ -0,0 +1,110 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" + latent="latents.pt" + tuned_checkpoint="./saved_results/" + iters=200 + for var in "$@" + do + case $var in + --iters=*) + iters=$(echo $var | cut -f2 -d=) + ;; + --int8=*) + int8=$(echo $var | cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var | cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd="--load" + model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" + precision="fp32" + latent="latents.pt" + base_output_dir="./output/" + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [[ ${mode} == "performance" ]]; then + extra_cmd=$extra_cmd" --performance" + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + python -u sdxl_smooth_quant.py \ + --model_name_or_path ${model_name_or_path} \ + --latent ${latent} \ + ${extra_cmd} + else + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + python -u sdxl_smooth_quant.py \ + --model_name_or_path ${model_name_or_path} \ + --latent ${latent} \ + ${extra_cmd} + + REPO_URL="https://github.com/ahmadki/mlperf_sd_inference.git" + TARGET_DIR="mlperf_sd_inference" + + if [ -d "$TARGET_DIR" ]; then + echo "Directory $TARGET_DIR already exists. Skipping git clone." + else + git clone "$REPO_URL" "$TARGET_DIR" + fi + + cd mlperf_sd_inference + cp ../main.py ./ + if [ -d "../saved_results/" ]; then + mv ../saved_results/ ./ + fi + + python -u main.py \ + --model-id ${model_name_or_path} \ + --quantized-unet ${tuned_checkpoint} \ + --precision ${precision} \ + --latent-path ${latent} \ + --base-output-dir ${base_output_dir} \ + --iters ${iters} \ + ${extra_cmd} + + mv ./output/stabilityai--stable-diffusion-xl-base-1.0__euler__20__8.0__fp32/* ./output/ + rm -rf ./output/stabilityai--stable-diffusion-xl-base-1.0__euler__20__8.0__fp32/ + + python clip/clip_score.py \ + --tsv-file captions_5k.tsv \ + --image-folder ${base_output_dir} \ + --device "cpu" + + cd ..
+ fi + +} + +main "$@" diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_quant.sh b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_quant.sh new file mode 100644 index 00000000000..e24ff49c78b --- /dev/null +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_quant.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --alpha=*) + alpha=$(echo $var |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo $var | cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd="" + model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" + n_steps=20 + calib_size=10 + batch_size=1 + latent="latents.pt" + alpha=0.44 + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8 --load" + else + extra_cmd=$extra_cmd" --quantize" + fi + echo $extra_cmd + + python -u sdxl_smooth_quant.py \ + --model_name_or_path ${model_name_or_path} \ + --n_steps ${n_steps} \ + --alpha ${alpha} \ + --latent ${latent} \ + ${extra_cmd} + +} + +main "$@" diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py new file mode 100644 index 00000000000..984a1696efd --- /dev/null +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py @@ -0,0 +1,436 @@ + +import os +import argparse +import torch +import intel_extension_for_pytorch as ipex +from diffusers import EulerDiscreteScheduler, StableDiffusionXLPipeline +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import ( + deprecate, retrieve_timesteps, rescale_noise_cfg, + PipelineImageInput, StableDiffusionXLPipelineOutput +) +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +def prompts2images(pipeline, prompts, **kwargs): + images = pipeline( + prompt=prompts, + num_inference_steps=kwargs["n_steps"], + negative_prompt=[ + "normal quality, low quality, worst quality, low res, blurry, nsfw, nude" + ] + * len(prompts), + latents=kwargs["latent"], + guidance_scale=8.0, # MLPerf requirements + ).images + return images + +def save_images(prompts, images, save_dir, prefix='ref'): + for prompt, image in zip(prompts, images): + image_name = f"{prefix}_{'_'.join(prompt.replace('/', ' ').split(' '))}.jpg" + image.save(os.path.join(save_dir, image_name)) + +def do_calibration(pipeline, calibration_prompts, **kwargs): + for i_th, prompts in enumerate(calibration_prompts): + if i_th >= kwargs["calib_size"]: + return + prompts2images(pipeline, prompts, **kwargs) + +class StableDiffusionXLPipelineSQ(StableDiffusionXLPipeline): + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 
5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = "cpu" + + # 3. 
Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + + # 8. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9. 
Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + )['sample'] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = 
self.image_processor.postprocess(image.detach(), output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name_or_path", type=str, default="stabilityai/stable-diffusion-xl-base-1.0" + ) + parser.add_argument("--quantize", action="store_true") + parser.add_argument("--load", action="store_true") + parser.add_argument("--int8", action="store_true", help="Load quantized model.") + parser.add_argument("--performance", action="store_true") + parser.add_argument("--n_steps", type=int, default=20) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--calib-size", type=int, default=10) + parser.add_argument("--latent", type=str, default="latents.pt") + parser.add_argument("--alpha", type=float, default=0.5, help="SmoothQuant Alpha") + parser.add_argument("--output_dir", type=str, default="./saved_results", help="output directory") + parser.add_argument("--iters", default=10, type=int, help="For performance measurement only.") + + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + args.calib_size = args.calib_size // args.batch_size + + dtype = torch.float32 + + pipeline = StableDiffusionXLPipelineSQ.from_pretrained( + args.model_name_or_path, + torch_dtype=dtype, + use_safetensors=True, + ) + pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config) + + # This is a list of prompts + cali_prompts = [['A brown and white dog runs on some brown grass near a Frisbee that is just sailing above the ground.'], + ['The bus is traveling down a two way street.']] + + torch.random.manual_seed(42) + if args.latent is not None: + init_latent = torch.load(args.latent).to(dtype) + else: + init_latent = torch.randn((1,4,128,128), dtype=dtype) + + prompts = cali_prompts[0] + ref_images = prompts2images(pipeline, prompts, n_steps=args.n_steps, latent=init_latent) + save_images(prompts, ref_images, args.output_dir, prefix='ref') + + def forward_loop(model): + do_calibration( + pipeline=pipeline, + calibration_prompts=cali_prompts, + calib_size=args.calib_size, + n_steps=args.n_steps, + latent=init_latent, + ) + + if args.quantize: + excluded_precisions = ["bf16"] + example_inputs = {"sample": torch.randn((2, 4, 128, 128), dtype=dtype), + "timestep": torch.tensor(951.0), + "encoder_hidden_states": torch.randn((2, 77, 2048), dtype=dtype), + "added_cond_kwargs": {'text_embeds':torch.randn((2, 1280), dtype=dtype), + 'time_ids': torch.tensor([[1024., 1024., 0., 0., 1024., 1024.], + [1024., 1024., 0., 0., 1024., 1024.]], dtype=dtype)},} + + from neural_compressor.torch.quantization import SmoothQuantConfig, prepare, convert + quant_config = SmoothQuantConfig(alpha=args.alpha, excluded_precisions=excluded_precisions) + user_model = prepare(model=pipeline.unet, quant_config=quant_config, example_inputs=example_inputs) + forward_loop(user_model) + q_unet = convert(user_model) + q_unet.save(args.output_dir) + + if args.load: + if args.int8: + from neural_compressor.torch.quantization import load + q_unet = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(q_unet, "config", pipeline.unet.config) + else: + q_unet = pipeline.unet + + pipeline.unet = q_unet + quant_images = prompts2images(pipeline, prompts, n_steps=args.n_steps, latent=init_latent) + save_images(prompts, quant_images, args.output_dir, prefix='quant') + + if 
args.performance: + import time + + total_iters = args.iters * args.batch_size + warmup_iters = 5 + for i in range(total_iters): + if i == warmup_iters: + start = time.time() + prompts2images(pipeline, prompts, n_steps=args.n_steps, latent=init_latent) + end = time.time() + + latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size) + throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start) + print("Latency: {:.3f} ms".format(latency * 10**3)) + print("Throughput: {:.3f} samples/sec".format(throughput)) + print('Batch size = %d' % args.batch_size) + + +if __name__ == "__main__": + main()
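
Taken together, both `sdxl_smooth_quant.py` and the modified `main.py` reload the saved INT8 UNet with the same pattern. The following is a minimal sketch of that pattern, assuming it is run from this example directory (so the `StableDiffusionXLPipelineSQ` subclass can be imported from `sdxl_smooth_quant.py`) and that the quantized UNet was saved to `./saved_results`; the prompt and output filename are only illustrative.

```python
import torch
from diffusers import EulerDiscreteScheduler
from neural_compressor.torch.quantization import load

# Pipeline subclass defined in sdxl_smooth_quant.py / main.py of this example.
from sdxl_smooth_quant import StableDiffusionXLPipelineSQ

# Build the FP32 pipeline; only the UNet is replaced with the quantized module.
pipe = StableDiffusionXLPipelineSQ.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float32,
    use_safetensors=True,
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

# Reload the INT8 UNet produced by `--quantize`
# (./saved_results should contain qconfig.json and quantized_model.pt).
# main.py additionally warms the loaded module up with two calls on static example inputs.
q_unet = load("./saved_results")
setattr(q_unet, "config", pipe.unet.config)  # the traced module has no diffusers config of its own
pipe.unet = q_unet

prompt = ("A brown and white dog runs on some brown grass near a Frisbee "
          "that is just sailing above the ground.")
image = pipe(prompt=prompt, guidance_scale=8.0, num_inference_steps=20).images[0]
image.save("dog_int8.png")
```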